Java Code Examples for org.apache.orc.TypeDescription#fromString()

The following examples show how to use org.apache.orc.TypeDescription#fromString(). Each example notes its source file, originating project, and license.
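Before the examples, a minimal sketch of the API itself: TypeDescription.fromString() parses an ORC type string (Hive-style syntax, as used throughout the examples below) into a TypeDescription tree. The schema string, class name, and printed values here are illustrative only and are not taken from any of the listed projects.

import org.apache.orc.TypeDescription;

public class FromStringSketch {
  public static void main(String[] args) {
    // Parse a struct schema from its string form.
    TypeDescription schema =
        TypeDescription.fromString("struct<id:int,name:string,tags:array<string>>");

    // The parsed tree exposes field names and child types.
    System.out.println(schema.getFieldNames());      // [id, name, tags]
    System.out.println(schema.getChildren().get(2)); // array<string>

    // toString() round-trips to the canonical string form,
    // so a schema can be stored or shipped as a plain String.
    System.out.println(schema);
  }
}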
Example 1
Source File: OrcKeyComparatorTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testSimpleComparator() throws Exception {
  OrcKeyComparator comparator = new OrcKeyComparator();
  Configuration conf = new Configuration();
  String orcSchema = "struct<i:int,j:int>";
  TypeDescription schema = TypeDescription.fromString(orcSchema);
  conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), orcSchema);
  Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), orcSchema);
  comparator.setConf(conf);

  OrcStruct record0 = createSimpleOrcStruct(schema, 1, 2);
  OrcStruct record1 = createSimpleOrcStruct(schema, 3, 4);
  OrcStruct record2 = createSimpleOrcStruct(schema, 3, 4);

  OrcKey orcKey0 = new OrcKey();
  orcKey0.key = record0;
  OrcKey orcKey1 = new OrcKey();
  orcKey1.key = record1;
  OrcKey orcKey2 = new OrcKey();
  orcKey2.key = record2;

  Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey2) == 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey0) > 0);
}
 
Example 2
Source File: OrcUtilsTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testNestedWithinUnionWithDiffTag() throws Exception {
  // Construct a union type with a different tag for the src object and dest object, and check that the up-convert happens correctly.
  TypeDescription structInUnionAsStruct = TypeDescription.fromString("struct<a:uniontype<struct<a:int,b:string>,int>>");
  OrcStruct structInUnionAsStructObject = (OrcStruct) OrcUtils.createValueRecursively(structInUnionAsStruct);
  OrcTestUtils
      .fillOrcStructWithFixedValue(structInUnionAsStructObject, structInUnionAsStruct, 0, intValue, stringValue, boolValue);
  Assert.assertEquals(((OrcStruct)((OrcUnion)structInUnionAsStructObject.getFieldValue("a")).getObject())
      .getFieldValue("a"), new IntWritable(intValue));

  OrcStruct structInUnionAsStructObject_2 = (OrcStruct) OrcUtils.createValueRecursively(structInUnionAsStruct);
  OrcTestUtils
      .fillOrcStructWithFixedValue(structInUnionAsStructObject_2, structInUnionAsStruct, 1, intValue, stringValue, boolValue);
  Assert.assertEquals(((OrcUnion)structInUnionAsStructObject_2.getFieldValue("a")).getObject(), new IntWritable(intValue));

  // Create a new record container, do up-convert twice and check if the value is propagated properly.
  OrcStruct container = (OrcStruct) OrcUtils.createValueRecursively(structInUnionAsStruct);
  OrcUtils.upConvertOrcStruct(structInUnionAsStructObject, container, structInUnionAsStruct);
  Assert.assertEquals(structInUnionAsStructObject, container);

  OrcUtils.upConvertOrcStruct(structInUnionAsStructObject_2, container, structInUnionAsStruct);
  Assert.assertEquals(structInUnionAsStructObject_2, container);
}
 
Example 3
Source File: OrcRowInputFormat.java    From Flink-CEPplus with Apache License 2.0
@SuppressWarnings("unchecked")
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
	batchSize = in.readInt();
	org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
	configuration.readFields(in);

	if (this.conf == null) {
		this.conf = configuration;
	}
	this.schema = TypeDescription.fromString(in.readUTF());

	this.selectedFields = new int[in.readInt()];
	for (int i = 0; i < selectedFields.length; i++) {
		this.selectedFields[i] = in.readInt();
	}

	this.conjunctPredicates = new ArrayList<>();
	int numPreds = in.readInt();
	for (int i = 0; i < numPreds; i++) {
		conjunctPredicates.add((Predicate) in.readObject());
	}
}
 
Example 4
Source File: OrcInputFormat.java    From flink with Apache License 2.0
@SuppressWarnings("unchecked")
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
	batchSize = in.readInt();
	Configuration configuration = new Configuration();
	configuration.readFields(in);

	if (this.conf == null) {
		this.conf = configuration;
	}
	this.schema = TypeDescription.fromString(in.readUTF());

	this.selectedFields = new int[in.readInt()];
	for (int i = 0; i < selectedFields.length; i++) {
		this.selectedFields[i] = in.readInt();
	}

	this.conjunctPredicates = new ArrayList<>();
	int numPreds = in.readInt();
	for (int i = 0; i < numPreds; i++) {
		conjunctPredicates.add((Predicate) in.readObject());
	}
}
 
Example 5
Source File: OrcRowInputFormat.java    From flink with Apache License 2.0
@SuppressWarnings("unchecked")
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
	batchSize = in.readInt();
	org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
	configuration.readFields(in);

	if (this.conf == null) {
		this.conf = configuration;
	}
	this.schema = TypeDescription.fromString(in.readUTF());

	this.selectedFields = new int[in.readInt()];
	for (int i = 0; i < selectedFields.length; i++) {
		this.selectedFields[i] = in.readInt();
	}

	this.conjunctPredicates = new ArrayList<>();
	int numPreds = in.readInt();
	for (int i = 0; i < numPreds; i++) {
		conjunctPredicates.add((Predicate) in.readObject());
	}
}
 
Example 6
Source File: OrcCompactionTaskTest.java    From incubator-gobblin with Apache License 2.0
private void createTestingData(File jobDir) throws Exception {
  // Write some ORC files for compaction.
  TypeDescription schema = TypeDescription.fromString("struct<i:int,j:int>");
  OrcStruct orcStruct_0 = (OrcStruct) OrcStruct.createValue(schema);
  orcStruct_0.setFieldValue("i", new IntWritable(1));
  orcStruct_0.setFieldValue("j", new IntWritable(2));

  OrcStruct orcStruct_1 = (OrcStruct) OrcStruct.createValue(schema);
  orcStruct_1.setFieldValue("i", new IntWritable(1));
  orcStruct_1.setFieldValue("j", new IntWritable(2));

  OrcStruct orcStruct_2 = (OrcStruct) OrcStruct.createValue(schema);
  orcStruct_2.setFieldValue("i", new IntWritable(2));
  orcStruct_2.setFieldValue("j", new IntWritable(3));

  OrcStruct orcStruct_3 = (OrcStruct) OrcStruct.createValue(schema);
  orcStruct_3.setFieldValue("i", new IntWritable(4));
  orcStruct_3.setFieldValue("j", new IntWritable(5));

  // Following pattern: FILENAME.RECORDCOUNT.EXTENSION
  File file_0 = new File(jobDir, "file_0.2." + extensionName);
  File file_1 = new File(jobDir, "file_1.2." + extensionName);

  writeOrcRecordsInFile(new Path(file_0.getAbsolutePath()), schema, ImmutableList.of(orcStruct_0, orcStruct_2));
  writeOrcRecordsInFile(new Path(file_1.getAbsolutePath()), schema, ImmutableList.of(orcStruct_1, orcStruct_3));
}
 
Example 7
Source File: OrcUtilsTest.java    From incubator-gobblin with Apache License 2.0
/**
 * This test mainly targets the following case:
 * Schema: struct<a:array<struct<a:int,b:int>>>
 * Field a is set to null by one call of "upConvertOrcStruct", but the subsequent call should still have the
 * nested field filled.
 */
@Test
public void testNestedFieldSequenceSet() throws Exception {
  TypeDescription schema = TypeDescription.fromString("struct<a:array<struct<a:int,b:int>>>");
  OrcStruct struct = (OrcStruct) OrcUtils.createValueRecursively(schema);
  OrcTestUtils.fillOrcStructWithFixedValue(struct, schema, 1, "test", true);
  OrcStruct structWithEmptyArray = (OrcStruct) OrcUtils.createValueRecursively(schema);
  OrcTestUtils.fillOrcStructWithFixedValue(structWithEmptyArray, schema, 1, "test", true);
  structWithEmptyArray.setFieldValue("a", null);
  OrcUtils.upConvertOrcStruct(structWithEmptyArray, struct, schema);
  Assert.assertEquals(struct, structWithEmptyArray);

  OrcStruct struct_2 = (OrcStruct) OrcUtils.createValueRecursively(schema);
  OrcTestUtils.fillOrcStructWithFixedValue(struct_2, schema, 2, "test", true);
  OrcUtils.upConvertOrcStruct(struct_2, struct, schema);
  Assert.assertEquals(struct, struct_2);
}
 
Example 8
Source File: OrcValueMapper.java    From incubator-gobblin with Apache License 2.0
@Override
protected void setup(Context context)
    throws IOException, InterruptedException {
  super.setup(context);
  this.jobConf = new JobConf(context.getConfiguration());
  this.outKey = new OrcKey();
  this.outKey.configure(jobConf);
  this.outValue = new OrcValue();
  this.outValue.configure(jobConf);
  this.mrOutputSchema =
      TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute()));
  this.shuffleKeySchema =
      TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()));
}
 
Example 9
Source File: OrcNoHiveBulkWriterFactory.java    From flink with Apache License 2.0
@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
	OrcFile.WriterOptions opts = OrcFile.writerOptions(new Properties(), conf);
	TypeDescription description = TypeDescription.fromString(schema);
	opts.setSchema(description);
	opts.physicalWriter(new PhysicalWriterImpl(out, opts));
	WriterImpl writer = new WriterImpl(null, new Path("."), opts);

	VectorizedRowBatch rowBatch = description.createRowBatch();
	return new BulkWriter<RowData>() {
		@Override
		public void addElement(RowData row) throws IOException {
			int rowId = rowBatch.size++;
			for (int i = 0; i < row.getArity(); ++i) {
				setColumn(rowId, rowBatch.cols[i], fieldTypes[i], row, i);
			}
			if (rowBatch.size == rowBatch.getMaxSize()) {
				writer.addRowBatch(rowBatch);
				rowBatch.reset();
			}
		}

		@Override
		public void flush() throws IOException {
			if (rowBatch.size != 0) {
				writer.addRowBatch(rowBatch);
				rowBatch.reset();
			}
		}

		@Override
		public void finish() throws IOException {
			flush();
			writer.close();
		}
	};
}
 
Example 10
Source File: OrcUtilsTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testSchemaContains() throws Exception {
  // Simple case.
  TypeDescription struct_0 = TypeDescription.fromString("struct<a:int,b:int>");
  TypeDescription struct_1 = TypeDescription.fromString("struct<a:int>");
  Assert.assertTrue(OrcUtils.eligibleForUpConvert(struct_0, struct_1));

  // Nested schema case.
  TypeDescription struct_2 = TypeDescription.fromString("struct<a:struct<a:int,b:int>,b:struct<c:int,d:int>,c:int>");
  TypeDescription struct_3 = TypeDescription.fromString("struct<a:struct<a:int>,b:struct<c:int>,c:int>");
  Assert.assertTrue(OrcUtils.eligibleForUpConvert(struct_2, struct_3));

  // Negative case.
  TypeDescription struct_4 = TypeDescription.fromString("struct<a:struct<a:int,b:int>,b:struct<c:int,d:int>,c:int>");
  TypeDescription struct_5 = TypeDescription.fromString("struct<a:struct<a:int>,b:struct<c:int>,d:int>");
  Assert.assertFalse(OrcUtils.eligibleForUpConvert(struct_4, struct_5));
  TypeDescription struct_6 = TypeDescription.fromString("struct<a:struct<a:int>,b:struct<e:int>,c:int>");
  Assert.assertFalse(OrcUtils.eligibleForUpConvert(struct_4, struct_6));

  // Cases where the target schema contains more.
  TypeDescription struct_7 = TypeDescription.fromString("struct<a:struct<a:int>,b:struct<e:int,f:int>,c:int>");
  Assert.assertTrue(OrcUtils.eligibleForUpConvert(struct_6, struct_7));

  // Negative case where the target schema contains more, but not all fields of the owning schema are present in the target schema.
  // Note that struct_8 has a field "a.x".
  TypeDescription struct_8 = TypeDescription.fromString("struct<a:struct<x:int>,b:struct<e:int>,c:int>");
  TypeDescription struct_9 = TypeDescription.fromString("struct<a:struct<a:int>,b:struct<e:int,f:int>,c:int>");
  Assert.assertFalse(OrcUtils.eligibleForUpConvert(struct_8, struct_9));
}
 
Example 11
Source File: OrcColumnarRowSplitReaderNoHiveTest.java    From flink with Apache License 2.0
@Override
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: ORC retains field name information, so the field names here must match the ORC schema.
	TypeDescription schema =
			TypeDescription.fromString(
					"struct<" +
							"f0:float," +
							"f1:double," +
							"f2:timestamp," +
							"f3:tinyint," +
							"f4:smallint" +
							">");

	org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
	Configuration conf = new Configuration();

	Writer writer =
			OrcFile.createWriter(filePath,
					OrcFile.writerOptions(conf).setSchema(schema));

	VectorizedRowBatch batch = schema.createRowBatch(rowSize);
	DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
	DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
	TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
	LongColumnVector col3 = (LongColumnVector) batch.cols[3];
	LongColumnVector col4 = (LongColumnVector) batch.cols[4];

	col0.noNulls = false;
	col1.noNulls = false;
	col2.noNulls = false;
	col3.noNulls = false;
	col4.noNulls = false;
	for (int i = 0; i < rowSize - 1; i++) {
		col0.vector[i] = i;
		col1.vector[i] = i;

		Timestamp timestamp = toTimestamp(i);
		col2.time[i] = timestamp.getTime();
		col2.nanos[i] = timestamp.getNanos();

		col3.vector[i] = i;
		col4.vector[i] = i;
	}

	col0.isNull[rowSize - 1] = true;
	col1.isNull[rowSize - 1] = true;
	col2.isNull[rowSize - 1] = true;
	col3.isNull[rowSize - 1] = true;
	col4.isNull[rowSize - 1] = true;

	batch.size = rowSize;
	writer.addRowBatch(batch);
	batch.reset();
	writer.close();
}
 
Example 12
Source File: OrcCompactionTaskTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void basicTestWithRecompactionAndBasicSchemaEvolution() throws Exception {
  File basePath = Files.createTempDir();
  basePath.deleteOnExit();

  String minutelyPath = "Identity/MemberAccount/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20";
  String hourlyPath = "Identity/MemberAccount/hourly/2017/04/03/10/";
  File jobDir = new File(basePath, minutelyPath);
  Assert.assertTrue(jobDir.mkdirs());

  // Writing some basic ORC files
  createTestingData(jobDir);

  // Writing an additional file with an ** evolved schema **.
  TypeDescription evolvedSchema = TypeDescription.fromString("struct<i:int,j:int,k:int>");
  OrcStruct orcStruct_4 = (OrcStruct) OrcStruct.createValue(evolvedSchema);
  orcStruct_4.setFieldValue("i", new IntWritable(5));
  orcStruct_4.setFieldValue("j", new IntWritable(6));
  orcStruct_4.setFieldValue("k", new IntWritable(7));

  File file_2 = new File(jobDir, "file_2.1." + extensionName);
  writeOrcRecordsInFile(new Path(file_2.getAbsolutePath()), evolvedSchema, ImmutableList.of(orcStruct_4));
  // Make this the newest file.
  file_2.setLastModified(Long.MAX_VALUE);

  // Verify execution
  // Overwrite the job configurator factory key.
  EmbeddedGobblin embeddedGobblin = createEmbeddedGobblin("basic", basePath.getAbsolutePath().toString())
      .setConfiguration(CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY,
      TestCompactionOrcJobConfigurator.Factory.class.getName())
      .setConfiguration(COMPACTION_OUTPUT_EXTENSION, extensionName)
      .setConfiguration(COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET, "Identity.*:0.1");
  JobExecutionResult execution = embeddedGobblin.run();
  Assert.assertTrue(execution.isSuccessful());

  // Result verification
  File outputDir = new File(basePath, hourlyPath);
  FileSystem fs = FileSystem.getLocal(new Configuration());
  List<FileStatus> statuses = new ArrayList<>();
  reloadFolder(statuses, outputDir, fs);

  Assert.assertTrue(statuses.size() == 1);
  List<OrcStruct> result = readOrcFile(statuses.get(0).getPath());
  Assert.assertEquals(result.size(), 4);
  Assert.assertEquals(result.get(0).getFieldValue("i"), new IntWritable(1));
  Assert.assertEquals(result.get(0).getFieldValue("j"), new IntWritable(2));
  Assert.assertNull(result.get(0).getFieldValue("k"));
  Assert.assertEquals(result.get(1).getFieldValue("i"), new IntWritable(2));
  Assert.assertEquals(result.get(1).getFieldValue("j"), new IntWritable(3));
  Assert.assertNull(result.get(1).getFieldValue("k"));
  Assert.assertEquals(result.get(2).getFieldValue("i"), new IntWritable(4));
  Assert.assertEquals(result.get(2).getFieldValue("j"), new IntWritable(5));
  Assert.assertNull(result.get(2).getFieldValue("k"));
  Assert.assertEquals(result.get(3).getFieldValue("i"), new IntWritable(5));
  Assert.assertEquals(result.get(3).getFieldValue("j"), new IntWritable(6));
  Assert.assertEquals(result.get(3).getFieldValue("k"), new IntWritable(7));

  // Adding new .orc file into the directory and verify if re-compaction is triggered.
  File file_late = new File(jobDir, "file_late.1." + extensionName);
  OrcStruct orcStruct_5 = (OrcStruct) OrcStruct.createValue(evolvedSchema);
  orcStruct_5.setFieldValue("i", new IntWritable(10));
  orcStruct_5.setFieldValue("j", new IntWritable(11));
  orcStruct_5.setFieldValue("k", new IntWritable(12));

  writeOrcRecordsInFile(new Path(file_late.getAbsolutePath()), evolvedSchema, ImmutableList.of(orcStruct_5));
  execution = embeddedGobblin.run();
  Assert.assertTrue(execution.isSuccessful());

  reloadFolder(statuses, outputDir, fs);
  result = readOrcFile(statuses.get(0).getPath());
  // Note: the previous execution's inspection gave 4 results; after re-compaction, this should give 1 more late record.
  Assert.assertEquals(result.size(), 4 + 1);
}
 
Example 13
Source File: Vectorizer.java    From flink with Apache License 2.0
public Vectorizer(final String schema) {
	checkNotNull(schema);
	this.schema = TypeDescription.fromString(schema);
}
 
Example 14
Source File: ORCRecordExtractorTest.java    From incubator-pinot with Apache License 2.0
/**
 * Create an ORC input file using the input records
 */
@Override
protected void createInputFile()
    throws IOException {
  TypeDescription schema = TypeDescription.fromString(
      "struct<user_id:int,firstName:string,lastName:string,bids:array<int>,campaignInfo:string,cost:double,timestamp:bigint>");
  Writer writer = OrcFile.createWriter(new Path(_dataFile.getAbsolutePath()),
      OrcFile.writerOptions(new Configuration()).setSchema(schema));

  int numRecords = _inputRecords.size();
  VectorizedRowBatch rowBatch = schema.createRowBatch(numRecords);
  LongColumnVector userIdVector = (LongColumnVector) rowBatch.cols[0];
  userIdVector.noNulls = false;
  BytesColumnVector firstNameVector = (BytesColumnVector) rowBatch.cols[1];
  firstNameVector.noNulls = false;
  BytesColumnVector lastNameVector = (BytesColumnVector) rowBatch.cols[2];
  ListColumnVector bidsVector = (ListColumnVector) rowBatch.cols[3];
  bidsVector.noNulls = false;
  LongColumnVector bidsElementVector = (LongColumnVector) bidsVector.child;
  bidsElementVector.ensureSize(6, false);
  BytesColumnVector campaignInfoVector = (BytesColumnVector) rowBatch.cols[4];
  DoubleColumnVector costVector = (DoubleColumnVector) rowBatch.cols[5];
  LongColumnVector timestampVector = (LongColumnVector) rowBatch.cols[6];

  for (int i = 0; i < numRecords; i++) {
    Map<String, Object> record = _inputRecords.get(i);

    Integer userId = (Integer) record.get("user_id");
    if (userId != null) {
      userIdVector.vector[i] = userId;
    } else {
      userIdVector.isNull[i] = true;
    }
    String firstName = (String) record.get("firstName");
    if (firstName != null) {
      firstNameVector.setVal(i, StringUtils.encodeUtf8(firstName));
    } else {
      firstNameVector.isNull[i] = true;
    }
    lastNameVector.setVal(i, StringUtils.encodeUtf8((String) record.get("lastName")));
    List<Integer> bids = (List<Integer>) record.get("bids");
    if (bids != null) {
      bidsVector.offsets[i] = bidsVector.childCount;
      bidsVector.lengths[i] = bids.size();
      for (int bid : bids) {
        bidsElementVector.vector[bidsVector.childCount++] = bid;
      }
    } else {
      bidsVector.isNull[i] = true;
    }
    campaignInfoVector.setVal(i, StringUtils.encodeUtf8((String) record.get("campaignInfo")));
    costVector.vector[i] = (double) record.get("cost");
    timestampVector.vector[i] = (long) record.get("timestamp");

    rowBatch.size++;
  }

  writer.addRowBatch(rowBatch);
  rowBatch.reset();
  writer.close();
}
 
Example 15
Source File: OrcCompactionTaskTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testReducerSideDedup() throws Exception {
  File basePath = Files.createTempDir();
  basePath.deleteOnExit();

  String minutelyPath = "Identity/MemberAccount/minutely/2020/04/03/10/20_30/run_2020-04-03-10-20";
  String hourlyPath = "Identity/MemberAccount/hourly/2020/04/03/10/";
  File jobDir = new File(basePath, minutelyPath);
  Assert.assertTrue(jobDir.mkdirs());

  TypeDescription nestedSchema = TypeDescription.fromString("struct<a:struct<a:int,b:string,c:int>,b:string,c:uniontype<int,string>>");
  // Create three records with the same values except for the top-level "b" column.
  OrcStruct nested_struct_1 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_1, nestedSchema, 1, "test1", true);
  ((OrcStruct)nested_struct_1).setFieldValue("b", new Text("uno"));
  OrcStruct nested_struct_2 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_2, nestedSchema, 1, "test2", true);
  ((OrcStruct)nested_struct_2).setFieldValue("b", new Text("dos"));
  OrcStruct nested_struct_3 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_3, nestedSchema, 1, "test3", true);
  ((OrcStruct)nested_struct_3).setFieldValue("b", new Text("tres"));
  // Create another two records whose values differ from the above three; they also differ from those records in column b.
  OrcStruct nested_struct_4 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_4, nestedSchema, 2, "test2", false);
  ((OrcStruct)nested_struct_4).setFieldValue("b", new Text("uno"));
  // This record will be considered a duplicate of nested_struct_4.
  OrcStruct nested_struct_5 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_5, nestedSchema, 2, "test2", false);
  ((OrcStruct)nested_struct_5).setFieldValue("b", new Text("uno"));

  // Following pattern: FILENAME.RECORDCOUNT.EXTENSION
  File file_0 = new File(jobDir, "file_0.5." + extensionName);
  writeOrcRecordsInFile(new Path(file_0.getAbsolutePath()), nestedSchema, ImmutableList.of(nested_struct_1,
      nested_struct_2, nested_struct_3, nested_struct_4, nested_struct_5));

  EmbeddedGobblin embeddedGobblin = createEmbeddedGobblin("basic", basePath.getAbsolutePath().toString())
      .setConfiguration(CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY,
          TestCompactionOrcJobConfigurator.Factory.class.getName())
      .setConfiguration(COMPACTION_OUTPUT_EXTENSION, extensionName)
      .setConfiguration(ORC_MAPPER_SHUFFLE_KEY_SCHEMA, "struct<a:struct<a:int,c:int>>");
  JobExecutionResult execution = embeddedGobblin.run();
  Assert.assertTrue(execution.isSuccessful());

  // Verify the result: the reducer should catch all the false duplicates.
  File outputDir = new File(basePath, hourlyPath);
  FileSystem fs = FileSystem.getLocal(new Configuration());
  List<FileStatus> statuses = new ArrayList<>();
  reloadFolder(statuses, outputDir, fs);
  Assert.assertEquals(statuses.size(), 1);
  List<OrcStruct> result = readOrcFile(statuses.get(0).getPath());
  // Should still contain the original 3 records since they have different values in columns not included in the shuffle key.
  Assert.assertEquals(result.size(), 4);
  Assert.assertTrue(result.contains(nested_struct_1));
  Assert.assertTrue(result.contains(nested_struct_2));
  Assert.assertTrue(result.contains(nested_struct_3));
  Assert.assertTrue(result.contains(nested_struct_4));
}
 
Example 16
Source File: OrcUtilsTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testRandomFillOrcStructWithAnySchema() {
  // 1. Basic case
  TypeDescription schema_1 = TypeDescription.fromString("struct<i:int,j:int,k:int>");
  OrcStruct expectedStruct = (OrcStruct) OrcStruct.createValue(schema_1);
  expectedStruct.setFieldValue("i", new IntWritable(3));
  expectedStruct.setFieldValue("j", new IntWritable(3));
  expectedStruct.setFieldValue("k", new IntWritable(3));

  OrcStruct actualStruct = (OrcStruct) OrcStruct.createValue(schema_1);
  OrcTestUtils.fillOrcStructWithFixedValue(actualStruct, schema_1, 3, "", false);
  Assert.assertEquals(actualStruct, expectedStruct);

  TypeDescription schema_2 = TypeDescription.fromString("struct<i:boolean,j:int,k:string>");
  expectedStruct = (OrcStruct) OrcStruct.createValue(schema_2);
  expectedStruct.setFieldValue("i", new BooleanWritable(false));
  expectedStruct.setFieldValue("j", new IntWritable(3));
  expectedStruct.setFieldValue("k", new Text(""));
  actualStruct = (OrcStruct) OrcStruct.createValue(schema_2);

  OrcTestUtils.fillOrcStructWithFixedValue(actualStruct, schema_2, 3, "", false);
  Assert.assertEquals(actualStruct, expectedStruct);

  // 2. Some simple nested cases: struct within struct
  TypeDescription schema_3 = TypeDescription.fromString("struct<i:boolean,j:struct<i:boolean,j:int,k:string>>");
  OrcStruct expectedStruct_nested_1 = (OrcStruct) OrcStruct.createValue(schema_3);
  expectedStruct_nested_1.setFieldValue("i", new BooleanWritable(false));
  expectedStruct_nested_1.setFieldValue("j", expectedStruct);
  actualStruct = (OrcStruct) OrcStruct.createValue(schema_3);

  OrcTestUtils.fillOrcStructWithFixedValue(actualStruct, schema_3, 3, "", false);
  Assert.assertEquals(actualStruct, expectedStruct_nested_1);

  // 3. array of struct within struct
  TypeDescription schema_4 = TypeDescription.fromString("struct<i:boolean,j:array<struct<i:boolean,j:int,k:string>>>");
  // Note that this will not create any elements in the array.
  expectedStruct_nested_1 = (OrcStruct) OrcStruct.createValue(schema_4);
  expectedStruct_nested_1.setFieldValue("i", new BooleanWritable(false));
  OrcList list = new OrcList(schema_2, 1);
  list.add(expectedStruct);
  expectedStruct_nested_1.setFieldValue("j", list);

  // Constructing actualStruct: make sure the list is non-empty. There isn't any meaningful value within the placeholder struct.
  actualStruct = (OrcStruct) OrcStruct.createValue(schema_4);
  OrcList placeHolderList = new OrcList(schema_2, 1);
  OrcStruct placeHolderStruct = (OrcStruct) OrcStruct.createValue(schema_2);
  placeHolderList.add(placeHolderStruct);
  actualStruct.setFieldValue("j", placeHolderList);

  OrcTestUtils.fillOrcStructWithFixedValue(actualStruct, schema_4, 3, "", false);
  Assert.assertEquals(actualStruct, expectedStruct_nested_1);

  // 4. union of struct within struct
  TypeDescription schema_5 = TypeDescription.fromString("struct<i:boolean,j:uniontype<struct<i:boolean,j:int,k:string>>>");
  expectedStruct_nested_1 = (OrcStruct) OrcStruct.createValue(schema_5);
  expectedStruct_nested_1.setFieldValue("i", new BooleanWritable(false));
  OrcUnion union = new OrcUnion(schema_2);
  union.set(0, expectedStruct);
  expectedStruct_nested_1.setFieldValue("j", union);

  // Construct actualStruct: make sure there's a struct-placeholder within the union.
  actualStruct = (OrcStruct) OrcStruct.createValue(schema_5);
  OrcUnion placeHolderUnion = new OrcUnion(schema_2);
  placeHolderUnion.set(0, placeHolderStruct);
  actualStruct.setFieldValue("j", placeHolderUnion);

  OrcTestUtils.fillOrcStructWithFixedValue(actualStruct, schema_5, 3, "", false);
  Assert.assertEquals(actualStruct, expectedStruct_nested_1);
}
 
Example 17
Source File: OrcTableSource.java    From flink with Apache License 2.0
/**
 * Sets the ORC schema of the files to read as a String.
 *
 * @param orcSchema The ORC schema of the files to read as a String.
 * @return The builder.
 */
public Builder forOrcSchema(String orcSchema) {
	Preconditions.checkNotNull(orcSchema, "ORC schema must not be null.");
	this.schema = TypeDescription.fromString(orcSchema);
	return this;
}
 
Example 18
Source File: OrcRowInputFormat.java    From flink with Apache License 2.0
/**
 * Creates an OrcRowInputFormat.
 *
 * @param path The path to read ORC files from.
 * @param schemaString The schema of the ORC files as String.
 * @param orcConfig The configuration to read the ORC files with.
 * @param batchSize The number of Row objects to read in a batch.
 */
public OrcRowInputFormat(String path, String schemaString, Configuration orcConfig, int batchSize) {
	this(path, TypeDescription.fromString(schemaString), orcConfig, batchSize);
}
 
Example 19
Source File: OrcRowInputFormat.java    From Flink-CEPplus with Apache License 2.0
/**
 * Creates an OrcRowInputFormat.
 *
 * @param path The path to read ORC files from.
 * @param schemaString The schema of the ORC files as String.
 * @param orcConfig The configuration to read the ORC files with.
 * @param batchSize The number of Row objects to read in a batch.
 */
public OrcRowInputFormat(String path, String schemaString, Configuration orcConfig, int batchSize) {
	this(path, TypeDescription.fromString(schemaString), orcConfig, batchSize);
}
 
Example 20
Source File: OrcRowInputFormat.java    From Flink-CEPplus with Apache License 2.0
/**
 * Creates an OrcRowInputFormat.
 *
 * @param path The path to read ORC files from.
 * @param schemaString The schema of the ORC files as String.
 * @param orcConfig The configuration to read the ORC files with.
 */
public OrcRowInputFormat(String path, String schemaString, Configuration orcConfig) {
	this(path, TypeDescription.fromString(schemaString), orcConfig, DEFAULT_BATCH_SIZE);
}