Java Code Examples for org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch

The following examples show how to use org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch. They are extracted from open source projects; the source project, file, and license are noted above each example.
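Most of the examples below follow the same create-fill-write cycle: create a batch from a TypeDescription, populate its column vectors row by row, and hand each full batch to an ORC Writer. As a quick orientation, here is a minimal, self-contained sketch of that cycle using the same ORC core writer API as the zeppelin and datacollector examples below; the schema string and output path are illustrative only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class VectorizedRowBatchSketch {
  public static void main(String[] args) throws Exception {
    TypeDescription schema = TypeDescription.fromString("struct<id:bigint>");
    Writer writer = OrcFile.createWriter(new Path("/tmp/rowbatch-sketch.orc"),
        OrcFile.writerOptions(new Configuration()).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector id = (LongColumnVector) batch.cols[0];
    for (long i = 0; i < 10_000; i++) {
      int row = batch.size++;                    // claim the next row slot in the batch
      id.vector[row] = i;
      if (batch.size == batch.getMaxSize()) {    // batch is full: flush it and reuse it
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    if (batch.size != 0) {                       // flush the final, partially filled batch
      writer.addRowBatch(batch);
      batch.reset();
    }
    writer.close();
  }
}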
Example 1
Source Project: localization_nifi    Source File: OrcFlowFileWriter.java    License: Apache License 2.0
public void addRowBatch(VectorizedRowBatch batch) throws IOException {
    if (buildIndex) {
        // Batch the writes up to the rowIndexStride so that we can get the
        // right size indexes.
        int posn = 0;
        while (posn < batch.size) {
            int chunkSize = Math.min(batch.size - posn,
                    rowIndexStride - rowsInIndex);
            treeWriter.writeRootBatch(batch, posn, chunkSize);
            posn += chunkSize;
            rowsInIndex += chunkSize;
            rowsInStripe += chunkSize;
            if (rowsInIndex >= rowIndexStride) {
                createRowIndexEntry();
            }
        }
    } else {
        rowsInStripe += batch.size;
        treeWriter.writeRootBatch(batch, 0, batch.size);
    }
    memoryManager.addedRow();
}
 
Example 2
Source Project: dremio-oss    Source File: HiveORCVectorizedReader.java    License: Apache License 2.0
/**
 * Helper method that creates {@link VectorizedRowBatch}. For each selected column an input vector is created in the
 * batch. For unselected columns the vector entry is going to be null. The order of input vectors in the batch should
 * match the order of the columns in the ORC file.
 *
 * @param rowOI Used to find the ordinal of the selected column.
 * @return the created {@link VectorizedRowBatch}
 */
private VectorizedRowBatch createVectorizedRowBatch(StructObjectInspector rowOI, boolean isOriginal) {
  final List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
  final List<ColumnVector> vectors = getVectors(rowOI);

  final VectorizedRowBatch result = new VectorizedRowBatch(fieldRefs.size());

  ColumnVector[] vectorArray =  vectors.toArray(new ColumnVector[0]);

  if (!isOriginal) {
    vectorArray = createTransactionalVectors(vectorArray);
  }

  result.cols = vectorArray;
  result.numCols = fieldRefs.size();
  result.reset();
  return result;
}
 
Example 3
Source Project: datacollector    Source File: AvroToOrcRecordConverter.java    License: Apache License 2.0
public static void addAvroRecord(
    VectorizedRowBatch batch,
    GenericRecord record,
    TypeDescription orcSchema,
    int orcBatchSize,
    Writer writer
) throws IOException {

  for (int c = 0; c < batch.numCols; c++) {
    ColumnVector colVector = batch.cols[c];
    final String thisField = orcSchema.getFieldNames().get(c);
    final TypeDescription type = orcSchema.getChildren().get(c);

    Object fieldValue = record.get(thisField);
    Schema.Field avroField = record.getSchema().getField(thisField);
    addToVector(type, colVector, avroField.schema(), fieldValue, batch.size);
  }

  batch.size++;

  if (batch.size % orcBatchSize == 0 || batch.size == batch.getMaxSize()) {
    writer.addRowBatch(batch);
    batch.reset();
    batch.size = 0;
  }
}
 
Example 4
Source Project: flink    Source File: OrcShimV200.java    License: Apache License 2.0
@Override
public boolean nextBatch(RecordReader reader, VectorizedRowBatch rowBatch) throws IOException {
	try {
		if (hasNextMethod == null) {
			hasNextMethod = Class.forName("org.apache.hadoop.hive.ql.io.orc.RecordReader")
					.getMethod("hasNext");
			hasNextMethod.setAccessible(true);
		}
		if (nextBatchMethod == null) {
			nextBatchMethod = RecordReader.class.getMethod("nextBatch", VectorizedRowBatch.class);
			nextBatchMethod.setAccessible(true);
		}
		boolean hasNext = (boolean) hasNextMethod.invoke(reader);
		if (hasNext) {
			nextBatchMethod.invoke(reader, rowBatch);
			return true;
		} else {
			return false;
		}
	} catch (IllegalAccessException |
			InvocationTargetException |
			NoSuchMethodException |
			ClassNotFoundException e) {
		throw new IOException(e);
	}
}
 
Example 5
Source Project: flink    Source File: OrcBulkWriterTestUtil.java    License: Apache License 2.0
private static List<Record> getResults(Reader reader) throws IOException {
	List<Record> results = new ArrayList<>();

	RecordReader recordReader = reader.rows();
	VectorizedRowBatch batch = reader.getSchema().createRowBatch();

	while (recordReader.nextBatch(batch)) {
		BytesColumnVector stringVector = (BytesColumnVector)  batch.cols[0];
		LongColumnVector intVector = (LongColumnVector) batch.cols[1];
		for (int r = 0; r < batch.size; r++) {
			String name = new String(stringVector.vector[r], stringVector.start[r], stringVector.length[r]);
			int age = (int) intVector.vector[r];

			results.add(new Record(name, age));
		}
	}
	recordReader.close();

	return results;
}
 
Example 6
Source Project: nifi    Source File: OrcFlowFileWriter.java    License: Apache License 2.0
public void addRowBatch(VectorizedRowBatch batch) throws IOException {
    if (buildIndex) {
        // Batch the writes up to the rowIndexStride so that we can get the
        // right size indexes.
        int posn = 0;
        while (posn < batch.size) {
            int chunkSize = Math.min(batch.size - posn,
                    rowIndexStride - rowsInIndex);
            treeWriter.writeRootBatch(batch, posn, chunkSize);
            posn += chunkSize;
            rowsInIndex += chunkSize;
            rowsInStripe += chunkSize;
            if (rowsInIndex >= rowIndexStride) {
                createRowIndexEntry();
            }
        }
    } else {
        rowsInStripe += batch.size;
        treeWriter.writeRootBatch(batch, 0, batch.size);
    }
    memoryManager.addedRow();
}
 
Example 7
Source Project: Flink-CEPplus    Source File: OrcBatchReader.java    License: Apache License 2.0
/**
 * Fills an ORC batch into an array of Row.
 *
 * @param rows The array of rows to be filled.
 * @param schema The schema of the ORC data.
 * @param batch The ORC data.
 * @param selectedFields The list of selected ORC fields.
 * @return The number of rows that were filled.
 */
static int fillRows(Row[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] selectedFields) {

	int rowsToRead = Math.min((int) batch.count(), rows.length);

	List<TypeDescription> fieldTypes = schema.getChildren();
	// read each selected field
	for (int fieldIdx = 0; fieldIdx < selectedFields.length; fieldIdx++) {
		int orcIdx = selectedFields[fieldIdx];
		readField(rows, fieldIdx, fieldTypes.get(orcIdx), batch.cols[orcIdx], rowsToRead);
	}
	return rowsToRead;
}
 
Example 8
Source Project: flink    Source File: OrcBatchReader.java    License: Apache License 2.0
/**
 * Fills an ORC batch into an array of Row.
 *
 * @param rows The array of rows to be filled.
 * @param schema The schema of the ORC data.
 * @param batch The ORC data.
 * @param selectedFields The list of selected ORC fields.
 * @return The number of rows that were filled.
 */
static int fillRows(Row[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] selectedFields) {

	int rowsToRead = Math.min((int) batch.count(), rows.length);

	List<TypeDescription> fieldTypes = schema.getChildren();
	// read each selected field
	for (int fieldIdx = 0; fieldIdx < selectedFields.length; fieldIdx++) {
		int orcIdx = selectedFields[fieldIdx];
		readField(rows, fieldIdx, fieldTypes.get(orcIdx), batch.cols[orcIdx], rowsToRead);
	}
	return rowsToRead;
}
 
Example 9
Source Project: pxf    Source File: HiveORCVectorizedResolver.java    License: Apache License 2.0
@Override
public List<List<OneField>> getFieldsForBatch(OneRow batch) {

    VectorizedRowBatch vectorizedBatch = (VectorizedRowBatch) batch.getData();

    /* Allocate empty result set */
    int columnsNumber = context.getColumns();
    resolvedBatch = new ArrayList<>(vectorizedBatch.size);

    /* Create empty template row */
    ArrayList<OneField> templateRow = new ArrayList<OneField>(columnsNumber);
    ArrayList<OneField> currentRow;
    for (int j = 0; j < context.getColumns(); j++) {
        templateRow.add(null);
    }
    /* Replicate template row*/
    for (int i = 0; i < vectorizedBatch.size; i++) {
        currentRow = new ArrayList<>(templateRow);
        resolvedBatch.add(currentRow);
    }

    /* process all columns*/
    List<? extends StructField> allStructFieldRefs = soi.getAllStructFieldRefs();
    for (int columnIndex = 0; columnIndex < vectorizedBatch.numCols; columnIndex++) {
        ObjectInspector oi = allStructFieldRefs.get(columnIndex).getFieldObjectInspector();
        if (oi.getCategory() == Category.PRIMITIVE) {
            resolvePrimitiveColumn(columnIndex, oi, vectorizedBatch);
        } else {
            throw new UnsupportedTypeException("Unable to resolve column index:" + columnIndex
                    + ". Only primitive types are supported.");
        }
    }

    return resolvedBatch;
}
 
Example 10
@Override
public boolean next( final NullWritable key, final VectorizedRowBatch outputBatch ) throws IOException {
  outputBatch.reset();
  setting.setPartitionValues( outputBatch );

  if( indexSize <= currentIndex ){
    if( ! currentReader.hasNext() ){
      updateCounter( currentReader.getReadStats() );
      outputBatch.endOfFile = true;
      isEnd = true;
      return false;
    }
    while( ! setSpread() ){
      if( ! currentReader.hasNext() ){
        updateCounter( currentReader.getReadStats() );
        outputBatch.endOfFile = true;
        isEnd = true;
        return false;
      }
    }
  }
  int maxSize = outputBatch.getMaxSize();
  if( indexSize < currentIndex + maxSize ){
    maxSize = indexSize - currentIndex;
  }

  for( int colIndex : needColumnIds ){
    assignors[colIndex].setColumnVector( outputBatch.cols[colIndex] , currentIndexList , currentIndex , maxSize );
  }
  outputBatch.size = maxSize;

  currentIndex += maxSize;
  if( indexSize <= currentIndex && ! currentReader.hasNext() ){
    outputBatch.endOfFile = true;
  }

  return outputBatch.size > 0;
}
 
Example 11
Source Project: dremio-oss    Source File: HiveORCVectorizedReader.java    License: Apache License 2.0
private ColumnVector getUnionColumnVector(UnionObjectInspector uoi) {
  ArrayList<ColumnVector> vectors = new ArrayList<>();
  List<? extends ObjectInspector> members = uoi.getObjectInspectors();
  for (ObjectInspector unionField: members) {
    vectors.add(getColumnVector(unionField));
  }
  ColumnVector[] columnVectors = vectors.toArray(new ColumnVector[0]);
  return new UnionColumnVector(VectorizedRowBatch.DEFAULT_SIZE, columnVectors);
}
 
Example 12
Source Project: dremio-oss    Source File: HiveORCVectorizedReader.java    License: Apache License 2.0
private ColumnVector getStructColumnVector(StructObjectInspector soi) {
  ArrayList<ColumnVector> vectors = new ArrayList<>();
  List<? extends StructField> members = soi.getAllStructFieldRefs();
  for (StructField structField: members) {
    vectors.add(getColumnVector(structField.getFieldObjectInspector()));
  }
  ColumnVector[] columnVectors = vectors.toArray(new ColumnVector[0]);
  return new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE, columnVectors);
}
 
Example 13
Source Project: dremio-oss    Source File: HiveORCVectorizedReader.java    License: Apache License 2.0
private ColumnVector getPrimitiveColumnVector(PrimitiveObjectInspector poi) {
    switch (poi.getPrimitiveCategory()) {
    case BOOLEAN:
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
    case DATE:
      return new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
    case TIMESTAMP:
      return new TimestampColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
    case FLOAT:
    case DOUBLE:
      return new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
    case BINARY:
    case STRING:
    case CHAR:
    case VARCHAR:
      return new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
    case DECIMAL:
      DecimalTypeInfo tInfo = (DecimalTypeInfo) poi.getTypeInfo();
      return new DecimalColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
        tInfo.precision(), tInfo.scale()
      );
    default:
      throw UserException.unsupportedError()
        .message("Vectorized ORC reader is not supported for datatype: %s", poi.getPrimitiveCategory())
        .build(logger);
    }
}
 
Example 14
Source Project: dremio-oss    Source File: HiveORCVectorizedReader.java    License: Apache License 2.0
private ColumnVector[] createTransactionalVectors(ColumnVector[] dataVectors) {
  ColumnVector[] transVectors = new ColumnVector[6];

  transVectors[0] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
  transVectors[1] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
  transVectors[2] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
  transVectors[3] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
  transVectors[4] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);

  transVectors[5] = new StructColumnVector(dataVectors.length, dataVectors);

  return transVectors;
}
 
Example 15
Source Project: dremio-oss    Source File: HiveORCCopiers.java    License: Apache License 2.0
/**
 * Helper method to create {@link ORCCopier}s based on the given input and output vector types and the projected column ordinals.
 *
 * @param projectedColOrdinals ordinals of the columns that we are interested in reading from the file.
 * @param output the value vectors to be filled
 * @param input the ORC row batch to copy from
 * @return one {@link ORCCopier} per output vector
 */
public static ORCCopier[] createCopiers(final HiveColumnVectorData columnVectorData,
                                        final List<Integer> projectedColOrdinals,
                                        int[] ordinalIdsFromOrcFile,
                                        final ValueVector[] output,
                                        final VectorizedRowBatch input,
                                        boolean isOriginal,
                                        HiveOperatorContextOptions operatorContextOptions) {
  final int numColumns = output.length;
  final ORCCopier[] copiers = new ORCCopier[numColumns];
  final ColumnVector[] cols = isOriginal ? input.cols : ((StructColumnVector) input.cols[HiveORCVectorizedReader.TRANS_ROW_COLUMN_INDEX]).fields;
  for (int i = 0; i < numColumns; i++) {
    boolean copierCreated = false;
    if (i < projectedColOrdinals.size()) {
      int projectedColOrdinal = projectedColOrdinals.get(i);
      if (projectedColOrdinal < ordinalIdsFromOrcFile.length && projectedColOrdinal < cols.length) {
        int ordinalId = ordinalIdsFromOrcFile[ projectedColOrdinal ];
        copiers[i] = createCopier(columnVectorData, ordinalId, output[i], cols[projectedColOrdinal], operatorContextOptions);
        copierCreated = true;
      }
    }
    if (!copierCreated) {
      copiers[i] = new NoOpCopier(null, null);
    }
  }
  return copiers;
}
 
Example 16
Source Project: zeppelin    Source File: SqlInterpreterTest.java    License: Apache License 2.0
public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.set("orc.compress", "snappy");
  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  Writer writer = OrcFile.createWriter(path,
          OrcFile.writerOptions(conf)
                  .setSchema(schema));
  VectorizedRowBatch batch = schema.createRowBatch();
  LongColumnVector x = (LongColumnVector) batch.cols[0];
  for (int i = 0; i < values.length; ++i) {
    int row = batch.size++;
    x.vector[row] = values[i];
    // If the batch is full, write it out and start over.
    if (batch.size == batch.getMaxSize()) {
      writer.addRowBatch(batch);
      batch.reset();
    }
  }
  if (batch.size != 0) {
    writer.addRowBatch(batch);
    batch.reset();
  }
  writer.close();
  return file;
}
 
Example 17
Source Project: datacollector    Source File: OrcToSdcRecordConverter.java    License: Apache License 2.0
private static void populateRecordFromRow(
    Record record,
    TypeDescription schema,
    VectorizedRowBatch batch,
    int rowNum
) {
  record.getHeader().setAttribute(ORC_SCHEMA_RECORD_ATTRIBUTE, schema.toString());
  record.set(Field.create(new LinkedHashMap<>()));
  for (int c = 0; c < batch.numCols; c++) {
    populateRecordFromRow(record, "/" + schema.getFieldNames().get(c), schema.getChildren().get(c), batch.cols[c], rowNum);
  }
}
 
Example 18
@Test
public void recordConversion() throws IOException {
  Path outputFilePath = new Path(createTempFile());

  Schema.Parser schemaParser = new Schema.Parser();
  Schema schema = schemaParser.parse(
      "{\"type\": \"record\", \"name\": \"MyRecord\", \"fields\": [{\"name\": \"first\", \"type\": \"int\"},{" +
          "\"name\": \"second\", \"type\": {\"type\": \"record\", \"name\": \"MySubRecord\", \"fields\":" +
          " [{\"name\": \"sub1\", \"type\": \"string\"}, {\"name\": \"sub2\", \"type\": \"int\"}] } }, {\"name\":" +
          " \"somedate\", \"type\": { \"type\" : \"int\", \"logicalType\": \"date\"} } ]}"
  );

  TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(schema);

  Writer orcWriter = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(),
      new Configuration(),
      outputFilePath,
      orcSchema
  );

  GenericRecord avroRecord = new GenericData.Record(schema);
  avroRecord.put("first", 1);
  avroRecord.put("somedate", 17535);

  GenericData.Record subRecord = new GenericData.Record(schema.getField("second").schema());
  subRecord.put("sub1", new Utf8("value1"));
  subRecord.put("sub2", 42);

  avroRecord.put("second", subRecord);

  VectorizedRowBatch batch = orcSchema.createRowBatch();

  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord, orcSchema, 1000, orcWriter);
  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();

  // TODO: add code to read the ORC file and validate the contents
}
 
Example 19
Source Project: flink    Source File: OrcShim.java    License: Apache License 2.0
/**
 * Create shim from hive version.
 */
static OrcShim<VectorizedRowBatch> createShim(String hiveVersion) {
	if (hiveVersion.startsWith("2.0")) {
		return new OrcShimV200();
	} else if (hiveVersion.startsWith("2.1")) {
		return new OrcShimV210();
	} else if (hiveVersion.startsWith("2.2") ||
			hiveVersion.startsWith("2.3") ||
			hiveVersion.startsWith("3.")) {
		return new OrcShimV230();
	} else {
		throw new UnsupportedOperationException(
				"Unsupported hive version for orc shim: " + hiveVersion);
	}
}
 
Example 20
Source Project: flink    Source File: RowDataVectorizer.java    License: Apache License 2.0
@Override
public void vectorize(RowData row, VectorizedRowBatch batch) {
	int rowId = batch.size++;
	for (int i = 0; i < row.getArity(); ++i) {
		setColumn(rowId, batch.cols[i], fieldTypes[i], row, i);
	}
}
 
Example 21
Source Project: flink    Source File: RecordVectorizer.java    License: Apache License 2.0
@Override
public void vectorize(Record element, VectorizedRowBatch batch) throws IOException {
	BytesColumnVector stringVector = (BytesColumnVector) batch.cols[0];
	LongColumnVector intColVector = (LongColumnVector) batch.cols[1];

	int row = batch.size++;

	stringVector.setVal(row, element.getName().getBytes(StandardCharsets.UTF_8));
	intColVector.vector[row] = element.getAge();

	this.addUserMetadata(OrcBulkWriterTestUtil.USER_METADATA_KEY, OrcBulkWriterTestUtil.USER_METADATA_VALUE);
}
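A Vectorizer implementation like the one above is what Flink's ORC bulk writer consumes: it is handed to an OrcBulkWriterFactory, which in turn plugs into a bulk-format streaming file sink. Below is a rough sketch under those assumptions (the flink-orc module on the classpath, the Record and RecordVectorizer test classes shown above, and an illustrative schema string and output path); it is not taken from any of the projects listed here.

import org.apache.flink.core.fs.Path;
import org.apache.flink.orc.writer.OrcBulkWriterFactory;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;

public class OrcSinkSketch {
  static StreamingFileSink<Record> buildSink() {
    // RecordVectorizer is the test vectorizer from the example above; the schema string is illustrative.
    OrcBulkWriterFactory<Record> factory =
        new OrcBulkWriterFactory<>(new RecordVectorizer("struct<name:string,age:int>"));
    // Attach the factory to a bulk-format streaming file sink.
    return StreamingFileSink
        .forBulkFormat(new Path("/tmp/orc-out"), factory)
        .build();
  }
}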
 
Example 22
Source Project: secor    Source File: JsonFieldFiller.java    License: Apache License 2.0
public static void processRow(JSONWriter writer, VectorizedRowBatch batch,
        TypeDescription schema, int row) throws JSONException {
    if (schema.getCategory() == TypeDescription.Category.STRUCT) {
        List<TypeDescription> fieldTypes = schema.getChildren();
        List<String> fieldNames = schema.getFieldNames();
        writer.object();
        for (int c = 0; c < batch.cols.length; ++c) {
            writer.key(fieldNames.get(c));
            setValue(writer, batch.cols[c], fieldTypes.get(c), row);
        }
        writer.endObject();
    } else {
        setValue(writer, batch.cols[0], schema, row);
    }
}
 
Example 23
Source Project: secor    Source File: VectorColumnFiller.java    License: Apache License 2.0
public static void fillRow(int rowIndex, JsonConverter[] converters,
        TypeDescription schema, VectorizedRowBatch batch, JsonObject data) {
    List<String> fieldNames = schema.getFieldNames();
    for (int c = 0; c < converters.length; ++c) {
        JsonElement field = data.get(fieldNames.get(c));
        if (field == null) {
            batch.cols[c].noNulls = false;
            batch.cols[c].isNull[rowIndex] = true;
        } else {
            converters[c].convert(field, batch.cols[c], rowIndex);
        }
    }
}