org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat Java Examples
The following examples show how to use
org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat.
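All of the examples below follow the same basic pattern: create a Hadoop Job that carries the configuration, wrap an org.apache.hadoop.mapreduce.InputFormat in a HadoopInputFormat, and hand it to ExecutionEnvironment#createInput, which produces a DataSet of key/value Tuple2 records. The following is a minimal, self-contained sketch of that pattern; the class name and input path are placeholders and are not taken from any of the projects listed below.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class HadoopInputFormatSketch {

    public static void main(String[] args) throws Exception {
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // The Hadoop Job only carries the configuration (input paths, format options, ...).
        Job job = Job.getInstance();
        TextInputFormat.addInputPath(job, new Path("hdfs:///path/to/input")); // placeholder path

        // Wrap the mapreduce InputFormat; the key/value classes must match the format's output types.
        HadoopInputFormat<LongWritable, Text> input =
            new HadoopInputFormat<>(new TextInputFormat(), LongWritable.class, Text.class, job);

        // Each record arrives as a Tuple2<key, value>.
        DataSet<Tuple2<LongWritable, Text>> lines = env.createInput(input);

        lines.first(10).print();
    }
}

The examples that follow vary only in which Hadoop InputFormat is wrapped (plain text, Parquet with Thrift, Protobuf, or Avro bindings) and in how the Job configuration is used for schema projection and predicate pushdown.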
Example #1
Source File: TPCHQuery3Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, CustomerTable>> getCustomerDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ID;MKTSEGMENT");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, CustomerTable.class, job);

    // Filter market segment "AUTOMOBILE"
    BinaryColumn mktsegment = binaryColumn("MKTSEGMENT");
    FilterPredicate mktsegmentPred = eq(mktsegment, Binary.fromString("AUTOMOBILE"));
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(), mktsegmentPred);

    ParquetThriftInputFormat.addInputPath(job, new Path(customerPath));

    DataSet<Tuple2<Void, CustomerTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #2
Source File: TPCDSQuery55Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, DateDimTable>> getDataDimDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();

    // Schema projection
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "d_date_sk;d_year;d_moy");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, DateDimTable.class, job);

    // Filter
    LongColumn moy = longColumn("d_moy");
    LongColumn year = longColumn("d_year");
    FilterPredicate moyPred = eq(moy, 11L);
    FilterPredicate yearPred = eq(year, 1999L);
    FilterPredicate constraint = and(moyPred, yearPred);
    ParquetThriftInputFormat.setFilterPredicate(job.getConfiguration(), constraint);

    ParquetThriftInputFormat.addInputPath(job, new Path(datadimPath));

    DataSet<Tuple2<Void, DateDimTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #3
Source File: TPCDSQuery55Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, ItemTable>> getItemDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();

    // Schema projection
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "i_item_sk;i_brand_id;i_brand;i_manager_id");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, ItemTable.class, job);
    ParquetThriftInputFormat.addInputPath(job, new Path(itemPath));

    // Filter
    LongColumn managerId = longColumn("i_manager_id");
    FilterPredicate managerPred = eq(managerId, 28L);
    ParquetThriftInputFormat.setFilterPredicate(job.getConfiguration(), managerPred);

    DataSet<Tuple2<Void, ItemTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #4
Source File: ParquetThriftExample.java From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person>> readThrift(ExecutionEnvironment env, String inputPath) throws IOException {
    Job job = Job.getInstance();

    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, Person.class, job);

    // schema projection: read only name, id, email and the phone number (the phone type is skipped)
    job.getConfiguration().set("parquet.thrift.column.filter", "name;id;email;phone/number");

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // push down predicates: get all persons with name = "Felix"
    BinaryColumn name = binaryColumn("name");
    FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

    DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #5
Source File: WordCount.java From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
        new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
        new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // set the separator under both the new and the old Hadoop property name
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
Example #6
Source File: WordCount.java From flink with Apache License 2.0
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
        new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
        new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // set the separator under both the new and the old Hadoop property name
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
Example #7
Source File: ParquetProtobufExample.java From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person.Builder>> readProtobuf(ExecutionEnvironment env, String inputPath) throws IOException {
    Job job = Job.getInstance();

    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ProtoParquetInputFormat(), Void.class, Person.Builder.class, job);

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // native predicate push down: read only records which satisfy a given constraint
    BinaryColumn name = binaryColumn("name");
    FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

    // schema projection: don't read the phone type attribute
    String projection = "message Person {\n" +
        " required binary name (UTF8);\n" +
        " required int32 id;\n" +
        " optional binary email (UTF8);\n" +
        " repeated group phone {\n" +
        "  required binary number (UTF8);\n" +
        " }\n" +
        "}";
    ProtoParquetInputFormat.setRequestedProjection(job, projection);

    DataSet<Tuple2<Void, Person.Builder>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #8
Source File: TPCDSQuery55Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, StoreSalesTable>> getStoreSalesDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();

    // Schema projection
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ss_sold_date_sk;ss_item_sk;ss_ext_sales_price");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, StoreSalesTable.class, job);

    ParquetThriftInputFormat.addInputPath(job, new Path(storesalesPath));

    DataSet<Tuple2<Void, StoreSalesTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #9
Source File: ParquetAvroExample.java From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person>> readAvro(ExecutionEnvironment env, String inputPath) throws IOException {
    Job job = Job.getInstance();

    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new AvroParquetInputFormat(), Void.class, Person.class, job);

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // schema projection: don't read the type of the phone number
    Schema phone = Schema.createRecord("PhoneNumber", null, null, false);
    phone.setFields(Arrays.asList(
        new Schema.Field("number", Schema.create(Schema.Type.BYTES), null, null)));
    Schema array = Schema.createArray(phone);
    Schema union = Schema.createUnion(Lists.newArrayList(
        Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.NULL)));

    Schema projection = Schema.createRecord("Person", null, null, false);
    projection.setFields(
        Arrays.asList(
            new Schema.Field("name", Schema.create(Schema.Type.BYTES), null, null),
            new Schema.Field("id", Schema.create(Schema.Type.INT), null, null),
            new Schema.Field("email", union, null, null),
            new Schema.Field("phone", array, null, null)
        )
    );
    AvroParquetInputFormat.setRequestedProjection(job, projection);

    // push down predicates: get all persons with name = "Felix"
    BinaryColumn name = binaryColumn("name");
    FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

    DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #10
Source File: TPCHQuery3Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, LineitemTable>> getLineitemDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ORDERKEY;EXTENDEDPRICE;DISCOUNT;SHIPDATE");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, LineitemTable.class, job);

    // Filter all Lineitems with l_shipdate > 12.03.1995
    ParquetThriftInputFormat.setUnboundRecordFilter(job, LineitemFilter.class);

    ParquetThriftInputFormat.addInputPath(job, new Path(lineitemPath));

    DataSet<Tuple2<Void, LineitemTable>> data = env.createInput(hadoopInputFormat);

    return data;
}
Example #11
Source File: TPCHQuery3Parquet.java From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, OrderTable>> getOrdersDataSet(ExecutionEnvironment env) throws IOException {
    Job job = Job.getInstance();
    ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
    job.getConfiguration().set("parquet.thrift.column.filter", "ID;CUSTKEY;ORDERDATE;SHIP_PRIORITY");
    HadoopInputFormat hadoopInputFormat =
        new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, OrderTable.class, job);

    ParquetThriftInputFormat.addInputPath(job, new Path(ordersPath));

    // Filter all Orders with o_orderdate < 12.03.1995
    ParquetThriftInputFormat.setUnboundRecordFilter(job, OrderFilter.class);

    DataSet<Tuple2<Void, OrderTable>> data = env.createInput(hadoopInputFormat);

    return data;
}