org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat Java Examples
The following examples show how to use
org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat.
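All of the examples below follow the same basic pattern: create a Hadoop Job that carries the configuration, wrap an org.apache.hadoop.mapreduce.InputFormat in a HadoopInputFormat, and hand it to ExecutionEnvironment#createInput, which produces a DataSet of key/value Tuple2 records. The following is a minimal, self-contained sketch of that pattern; the class name and input path are placeholders and are not taken from any of the projects listed below.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class HadoopInputFormatSketch {

    public static void main(String[] args) throws Exception {
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // The Hadoop Job only carries the configuration (input paths, format options, ...).
        Job job = Job.getInstance();
        TextInputFormat.addInputPath(job, new Path("hdfs:///path/to/input")); // placeholder path

        // Wrap the mapreduce InputFormat; the key/value classes must match the format's output types.
        HadoopInputFormat<LongWritable, Text> input =
            new HadoopInputFormat<>(new TextInputFormat(), LongWritable.class, Text.class, job);

        // Each record arrives as a Tuple2<key, value>.
        DataSet<Tuple2<LongWritable, Text>> lines = env.createInput(input);

        lines.first(10).print();
    }
}

The examples that follow vary only in which Hadoop InputFormat is wrapped (plain text, Parquet with Thrift, Protobuf, or Avro bindings) and in how the Job configuration is used for schema projection and predicate pushdown.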
Example #1
Source File: TPCHQuery3Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, CustomerTable>> getCustomerDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ID;MKTSEGMENT");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, CustomerTable.class, job);

    // Filter market segment "AUTOMOBILE"
    BinaryColumn mktsegment = binaryColumn("MKTSEGMENT");
    FilterPredicate mktsegmentPred = eq(mktsegment, Binary.fromString("AUTOMOBILE"));
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(), mktsegmentPred);

    ParquetThriftInputFormat.addInputPath(job, new Path(customerPath));

    DataSet<Tuple2<Void, CustomerTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #2
Source File: TPCDSQuery55Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, DateDimTable>> getDataDimDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();

    // Schema projection
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "d_date_sk;d_year;d_moy");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, DateDimTable.class, job);

    // Filter
    LongColumn moy = longColumn("d_moy");
    LongColumn year = longColumn("d_year");
    FilterPredicate moyPred = eq(moy, 11L);
    FilterPredicate yearPred = eq(year, 1999L);
    FilterPredicate constraint = and(moyPred, yearPred);
    ParquetThriftInputFormat.setFilterPredicate(job.getConfiguration(), constraint);

    ParquetThriftInputFormat.addInputPath(job, new Path(datadimPath));

    DataSet<Tuple2<Void, DateDimTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #3
Source File: TPCDSQuery55Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, ItemTable>> getItemDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();

    // Schema projection
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "i_item_sk;i_brand_id;i_brand;i_manager_id");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, ItemTable.class, job);
    ParquetThriftInputFormat.addInputPath(job, new Path(itemPath));

    // Filter
    LongColumn managerId = longColumn("i_manager_id");
    FilterPredicate managerPred = eq(managerId, 28L);
    ParquetThriftInputFormat.setFilterPredicate(job.getConfiguration(), managerPred);

    DataSet<Tuple2<Void, ItemTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #4
Source File: ParquetThriftExample.java From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person>> readThrift(ExecutionEnvironment env, String inputPath) throws IOException {
    Job job = Job.getInstance();

    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, Person.class, job);

    // schema projection: read only name, id, email and the phone number (the phone type is skipped)
    job.getConfiguration().set("parquet.thrift.column.filter", "name;id;email;phone/number");

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // push down predicates: get all persons with name = "Felix"
    BinaryColumn name = binaryColumn("name");
    FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

    DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #5
Source File: WordCount.java From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
        new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
        new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // set the separator under both the new and the old Hadoop property name
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
Example #6
Source File: WordCount.java From flink with Apache License 2.0
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
        new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
        new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // set the separator under both the new and the old Hadoop property name
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
Example #7
Source File: ParquetProtobufExample.java From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person.Builder>> readProtobuf(ExecutionEnvironment env, String inputPath) throws IOException {
    Job job = Job.getInstance();

    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ProtoParquetInputFormat(), Void.class, Person.Builder.class, job);

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // native predicate push down: read only records which satisfy a given constraint
    BinaryColumn name = binaryColumn("name");
    FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

    // schema projection: don't read the phone type attribute
    String projection = "message Person {\n" +
        " required binary name (UTF8);\n" +
        " required int32 id;\n" +
        " optional binary email (UTF8);\n" +
        " repeated group phone {\n" +
        "  required binary number (UTF8);\n" +
        " }\n" +
        "}";
    ProtoParquetInputFormat.setRequestedProjection(job, projection);

    DataSet<Tuple2<Void, Person.Builder>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #8
Source File: TPCDSQuery55Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, StoreSalesTable>> getStoreSalesDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();

    // Schema projection
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ss_sold_date_sk;ss_item_sk;ss_ext_sales_price");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, StoreSalesTable.class, job);

    ParquetThriftInputFormat.addInputPath(job, new Path(storesalesPath));

    DataSet<Tuple2<Void, StoreSalesTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #9
Source File: ParquetAvroExample.java From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person>> readAvro(ExecutionEnvironment env, String inputPath) throws IOException {
    Job job = Job.getInstance();

    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new AvroParquetInputFormat(), Void.class, Person.class, job);

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // schema projection: don't read the type of the phone number
    Schema phone = Schema.createRecord("PhoneNumber", null, null, false);
    phone.setFields(Arrays.asList(
        new Schema.Field("number", Schema.create(Schema.Type.BYTES), null, null)));
    Schema array = Schema.createArray(phone);
    Schema union = Schema.createUnion(Lists.newArrayList(
        Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.NULL)));

    Schema projection = Schema.createRecord("Person", null, null, false);
    projection.setFields(
        Arrays.asList(
            new Schema.Field("name", Schema.create(Schema.Type.BYTES), null, null),
            new Schema.Field("id", Schema.create(Schema.Type.INT), null, null),
            new Schema.Field("email", union, null, null),
            new Schema.Field("phone", array, null, null)
        )
    );
    AvroParquetInputFormat.setRequestedProjection(job, projection);

    // push down predicates: get all persons with name = "Felix"
    BinaryColumn name = binaryColumn("name");
    FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

    DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #10
Source File: TPCHQuery3Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, LineitemTable>> getLineitemDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ORDERKEY;EXTENDEDPRICE;DISCOUNT;SHIPDATE");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, LineitemTable.class, job);

    // Filter all Lineitems with l_shipdate > 12.03.1995
    ParquetThriftInputFormat.setUnboundRecordFilter(job, LineitemFilter.class);

    ParquetThriftInputFormat.addInputPath(job, new Path(lineitemPath));

    DataSet<Tuple2<Void, LineitemTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #11
Source File: TPCHQuery3Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, OrderTable>> getOrdersDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ID;CUSTKEY;ORDERDATE;SHIP_PRIORITY");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, OrderTable.class, job);

    ParquetThriftInputFormat.addInputPath(job, new Path(ordersPath));

    // Filter all Orders with o_orderdate < 12.03.1995
    ParquetThriftInputFormat.setUnboundRecordFilter(job, OrderFilter.class);

    DataSet<Tuple2<Void, OrderTable>> data = env.createInput(hadoopInputFormat);

    return data;
}