org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat Java Examples

The following examples show how to use org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat. Each example is taken from an open-source project; the source file and project are noted above each snippet.
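HadoopInputFormat bridges Hadoop's mapreduce InputFormat implementations into Flink's DataSet API: the Hadoop Job only acts as a configuration holder, and every Hadoop (key, value) record arrives as a Flink Tuple2. Before the project examples, here is a minimal, self-contained sketch of that pattern (the input path is a placeholder, not taken from any of the examples below):

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class HadoopInputFormatSketch {
	public static void main(String[] args) throws Exception {
		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// The Job instance only carries configuration; it is never submitted to a Hadoop cluster.
		Job job = Job.getInstance();
		TextInputFormat.addInputPath(job, new Path("hdfs:///tmp/input")); // placeholder path

		// Wrap the Hadoop InputFormat together with its key and value classes.
		HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
			new HadoopInputFormat<>(new TextInputFormat(), LongWritable.class, Text.class, job);

		// Every Hadoop (key, value) pair becomes a Flink Tuple2<key, value>.
		DataSet<Tuple2<LongWritable, Text>> lines = env.createInput(hadoopInputFormat);
		lines.first(10).print();
	}
}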
Example #1
Source File: TPCHQuery3Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, CustomerTable>> getCustomerDataSet(ExecutionEnvironment env) throws 
	IOException {
	Job job = Job.getInstance();

	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "ID;MKTSEGMENT");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		CustomerTable.class, job);

	// Filter market segment "AUTOMOBILE"
	BinaryColumn mktsegment = binaryColumn("MKTSEGMENT");
	FilterPredicate mktsegmentPred = eq(mktsegment, Binary.fromString("AUTOMOBILE"));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), mktsegmentPred);

	ParquetThriftInputFormat.addInputPath(job, new Path(customerPath));

	DataSet<Tuple2<Void, CustomerTable>> data = env.createInput(hadoopInputFormat);

	return data;
}
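The statically imported helpers used in this and the following Parquet examples (binaryColumn, longColumn, eq, and) come from Parquet's filter2 predicate API. The imports below are a sketch of what these snippets presumably rely on; the package prefix assumes the older parquet.* artifacts of that era, while newer releases use org.apache.parquet.* instead:

// Predicate push-down helpers (assumed imports, not shown in the original snippets).
import static parquet.filter2.predicate.FilterApi.and;
import static parquet.filter2.predicate.FilterApi.binaryColumn;
import static parquet.filter2.predicate.FilterApi.eq;
import static parquet.filter2.predicate.FilterApi.longColumn;

import parquet.filter2.predicate.FilterPredicate;
import parquet.filter2.predicate.Operators.BinaryColumn;
import parquet.filter2.predicate.Operators.LongColumn;
import parquet.io.api.Binary;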
 
Example #2
Source File: TPCDSQuery55Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, DateDimTable>> getDataDimDataSet(ExecutionEnvironment env) throws IOException {
	Job job = Job.getInstance();

	//Schema projection
	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "d_date_sk;d_year;d_moy");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		DateDimTable.class, job);

	// Filter
	LongColumn moy = longColumn("d_moy");
	LongColumn year = longColumn("d_year");
	FilterPredicate moyPred = eq(moy, 11L);
	FilterPredicate yearPred = eq(year, 1999L);
	FilterPredicate constraint = and(moyPred, yearPred);
	ParquetThriftInputFormat.setFilterPredicate(job.getConfiguration(), constraint);

	ParquetThriftInputFormat.addInputPath(job, new Path(datadimPath));

	DataSet<Tuple2<Void, DateDimTable>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example #3
Source File: TPCDSQuery55Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, ItemTable>> getItemDataSet(ExecutionEnvironment env) throws IOException {
	Job job = Job.getInstance();

	//Schema projection
	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "i_item_sk;i_brand_id;i_brand;i_manager_id");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		ItemTable.class, job);
	ParquetThriftInputFormat.addInputPath(job, new Path(itemPath));

	//Filter		
	LongColumn managerId = longColumn("i_manager_id");
	FilterPredicate managerPred = eq(managerId, 28L);
	ParquetThriftInputFormat.setFilterPredicate(job.getConfiguration(), managerPred);

	DataSet<Tuple2<Void, ItemTable>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example #4
Source File: ParquetThriftExample.java    From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person>> readThrift(ExecutionEnvironment env, String inputPath) throws 
	IOException {
	Job job = Job.getInstance();

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, Person
		.class, job);

	// schema projection: read only the name, id, email and phone number columns
	job.getConfiguration().set("parquet.thrift.column.filter", "name;id;email;phone/number");

	FileInputFormat.addInputPath(job, new Path(inputPath));

	// push down predicates: get all persons with name = "Felix"
	BinaryColumn name = binaryColumn("name");
	FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

	DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example #5
Source File: WordCount.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	Job job = Job.getInstance();
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
	TextInputFormat.addInputPath(job, new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

	// Tokenize the line and convert from Writable "Text" to String for better handling
	DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

	// Sum up the words
	DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

	// Convert String back to Writable "Text" for use with Hadoop Output Format
	DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
	hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
	hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
	TextOutputFormat.setOutputPath(job, new Path(outputPath));

	// Output & Execute
	hadoopResult.output(hadoopOutputFormat);
	env.execute("Word Count");
}
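Tokenizer and HadoopDatatypeMapper are user functions defined elsewhere in the example and are not part of HadoopInputFormat itself. A plausible sketch of the two helpers (reconstructed here for illustration, not copied from the project) could look like this:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

// Splits each Hadoop (offset, line) record into lower-case words and emits (word, 1) pairs.
final class Tokenizer implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
	@Override
	public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
		for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
			if (token.length() > 0) {
				out.collect(new Tuple2<>(token, 1));
			}
		}
	}
}

// Converts the plain Java (String, Integer) result back into Hadoop Writables for the output format.
final class HadoopDatatypeMapper implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
	@Override
	public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
		return new Tuple2<>(new Text(value.f0), new IntWritable(value.f1));
	}
}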
 
Example #6
Source File: WordCount.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	Job job = Job.getInstance();
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
	TextInputFormat.addInputPath(job, new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

	// Tokenize the line and convert from Writable "Text" to String for better handling
	DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

	// Sum up the words
	DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

	// Convert String back to Writable "Text" for use with Hadoop Output Format
	DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
	hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
	hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
	TextOutputFormat.setOutputPath(job, new Path(outputPath));

	// Output & Execute
	hadoopResult.output(hadoopOutputFormat);
	env.execute("Word Count");
}
 
Example #7
Source File: ParquetProtobufExample.java    From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person.Builder>> readProtobuf(ExecutionEnvironment env, String inputPath) 
	throws IOException {
	Job job = Job.getInstance();

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ProtoParquetInputFormat(), Void.class, Person
		.Builder.class, job);

	FileInputFormat.addInputPath(job, new Path(inputPath));

	//native predicate push down: read only records which satisfy a given constraint
	BinaryColumn name = binaryColumn("name");
	FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

	//schema projection: don't read the type field of the phone numbers
	String projection = "message Person {\n" +
		"  required binary name (UTF8);\n" +
		"  required int32 id;\n" +
		"  optional binary email (UTF8);\n" +
		"  repeated group phone {\n" +
		"    required binary number (UTF8);\n" +
		"  }\n" +
		"}";
	ProtoParquetInputFormat.setRequestedProjection(job, projection);

	DataSet<Tuple2<Void, Person.Builder>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example #8
Source File: TPCDSQuery55Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, StoreSalesTable>> getStoreSalesDataSet(ExecutionEnvironment env) throws 
	IOException {
	Job job = Job.getInstance();

	//Schema projection		
	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "ss_sold_date_sk;ss_item_sk;ss_ext_sales_price");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		StoreSalesTable.class, job);
	ParquetThriftInputFormat.addInputPath(job, new Path(storesalesPath));
	DataSet<Tuple2<Void, StoreSalesTable>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example #9
Source File: ParquetAvroExample.java    From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person>> readAvro(ExecutionEnvironment env, String inputPath) throws
	IOException {
	Job job = Job.getInstance();

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new AvroParquetInputFormat(), Void.class, Person
		.class, job);

	FileInputFormat.addInputPath(job, new Path(inputPath));

	// schema projection: don't read the type field of the phone numbers
	Schema phone = Schema.createRecord("PhoneNumber", null, null, false);
	phone.setFields(Arrays.asList(
		new Schema.Field("number", Schema.create(Schema.Type.BYTES), null, null)));

	Schema array = Schema.createArray(phone);
	Schema union = Schema.createUnion(Lists.newArrayList(
		Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.NULL)));

	Schema projection = Schema.createRecord("Person", null, null, false);
	projection.setFields(
		Arrays.asList(
			new Schema.Field("name", Schema.create(Schema.Type.BYTES), null, null),
			new Schema.Field("id", Schema.create(Schema.Type.INT), null, null),
			new Schema.Field("email", union, null, null),
			new Schema.Field("phone", array, null, null)
		)
	);

	AvroParquetInputFormat.setRequestedProjection(job, projection);

	// push down predicates: get all persons with name = "Felix"
	BinaryColumn name = binaryColumn("name");
	FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

	DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example #10
Source File: TPCHQuery3Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, LineitemTable>> getLineitemDataSet(ExecutionEnvironment env) throws 
	IOException {
	Job job = Job.getInstance();

	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "ORDERKEY;EXTENDEDPRICE;DISCOUNT;SHIPDATE");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		LineitemTable.class, job);

	// Filter all Lineitems with l_shipdate > 12.03.1995
	ParquetThriftInputFormat.setUnboundRecordFilter(job, LineitemFilter.class);

	ParquetThriftInputFormat.addInputPath(job, new Path(lineitemPath));

	DataSet<Tuple2<Void, LineitemTable>> data = env.createInput(hadoopInputFormat);

	return data;
}
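Unlike the other TPC-H snippets, the ship-date constraint here goes through Parquet's older record-filter API: setUnboundRecordFilter registers a filter class that is instantiated on the reading side. LineitemFilter itself is not shown in this snippet; below is a rough sketch of what such a filter could look like, assuming the pre-org.apache parquet.filter API and assuming SHIPDATE is stored as a long in yyyyMMdd form (both are assumptions, not taken from the example):

import parquet.column.ColumnReader;
import parquet.filter.ColumnPredicates;
import parquet.filter.ColumnRecordFilter;
import parquet.filter.RecordFilter;
import parquet.filter.UnboundRecordFilter;

// Hypothetical reconstruction of the LineitemFilter referenced above.
public class LineitemFilter implements UnboundRecordFilter {

	// Keep only records whose SHIPDATE column is after 1995-03-12
	// (assumes the date is encoded as a long in yyyyMMdd form; adjust to the actual encoding).
	private final UnboundRecordFilter filter = ColumnRecordFilter.column(
		"SHIPDATE",
		ColumnPredicates.applyFunctionToLong(input -> input > 19950312L));

	@Override
	public RecordFilter bind(Iterable<ColumnReader> readers) {
		return filter.bind(readers);
	}
}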
 
Example #11
Source File: TPCHQuery3Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, OrderTable>> getOrdersDataSet(ExecutionEnvironment env) throws IOException {
	Job job = Job.getInstance();

	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "ID;CUSTKEY;ORDERDATE;SHIP_PRIORITY");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		OrderTable.class, job);

	ParquetThriftInputFormat.addInputPath(job, new Path(ordersPath));

	// Filter all Orders with o_orderdate < 12.03.1995
	ParquetThriftInputFormat.setUnboundRecordFilter(job, OrderFilter.class);


	DataSet<Tuple2<Void, OrderTable>> data = env.createInput(hadoopInputFormat);

	return data;
}