Java Code Examples for org.apache.flink.api.java.ExecutionEnvironment#createInput()

The following examples show how to use org.apache.flink.api.java.ExecutionEnvironment#createInput(). They are drawn from open-source projects; the originating project, source file, and license are noted above each example.
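Before the project snippets, here is a minimal, self-contained sketch of the pattern they all share: createInput() wraps an InputFormat as a DataSet source. This sketch is illustrative only (the input path is a placeholder, and the explicit BasicTypeInfo argument is optional when the element type can be inferred from the format); it is not taken from any of the projects below.

import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;

public class CreateInputSketch {

	public static void main(String[] args) throws Exception {
		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// Wrap a line-oriented file format as a DataSet source. The second argument
		// supplies the element TypeInformation explicitly, as several of the examples
		// below do for formats whose element type cannot be extracted automatically.
		DataSet<String> lines = env.createInput(
				new TextInputFormat(new Path("/tmp/some-input.txt")),   // placeholder path
				BasicTypeInfo.STRING_TYPE_INFO);

		lines.print();
	}
}
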
Example 1
Source File: AvroTypeExtractionTest.java    From flink with Apache License 2.0
@Test
public void testKeySelection() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableObjectReuse();
	Path in = new Path(inFile.getAbsoluteFile().toURI());

	AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
	DataSet<User> usersDS = env.createInput(users);

	DataSet<Tuple2<String, Integer>> res = usersDS
		.groupBy("name")
		.reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
			for (User u : values) {
				out.collect(new Tuple2<>(u.getName().toString(), 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT));
	res.writeAsText(resultPath);
	env.execute("Avro Key selection");

	expected = "(Alyssa,1)\n(Charlie,1)\n";
}
 
Example 2
Source File: TPCDSQuery55Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, DateDimTable>> getDataDimDataSet(ExecutionEnvironment env) throws IOException {
	Job job = Job.getInstance();

	// Schema projection
	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "d_date_sk;d_year;d_moy");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		DateDimTable.class, job);

	// Filter
	LongColumn moy = longColumn("d_moy");
	LongColumn year = longColumn("d_year");
	FilterPredicate moyPred = eq(moy, 11L);
	FilterPredicate yearPred = eq(year, 1999L);
	FilterPredicate constraint = and(moyPred, yearPred);
	ParquetThriftInputFormat.setFilterPredicate(job.getConfiguration(), constraint);

	ParquetThriftInputFormat.addInputPath(job, new Path(datadimPath));

	DataSet<Tuple2<Void, DateDimTable>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example 3
Source File: ParquetThriftExample.java    From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person>> readThrift(ExecutionEnvironment env, String inputPath) throws 
	IOException {
	Job job = Job.getInstance();

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, Person
		.class, job);

	// schema projection: read only the name, id, email, and phone/number columns
	job.getConfiguration().set("parquet.thrift.column.filter", "name;id;email;phone/number");

	FileInputFormat.addInputPath(job, new Path(inputPath));

	// push down predicates: get all persons with name = "Felix"
	BinaryColumn name = binaryColumn("name");
	FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

	DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example 4
Source File: AvroTypeExtractionTest.java    From flink with Apache License 2.0
@Test
public void testWithKryoGenericSer() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableForceKryo();
	Path in = new Path(inFile.getAbsoluteFile().toURI());

	AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
	DataSet<User> usersDS = env.createInput(users);

	DataSet<Tuple2<String, Integer>> res = usersDS
		.groupBy((KeySelector<User, String>) value -> String.valueOf(value.getName()))
		.reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
			for (User u : values) {
				out.collect(new Tuple2<>(u.getName().toString(), 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT));

	res.writeAsText(resultPath);
	env.execute("Avro Key selection");

	expected = "(Charlie,1)\n(Alyssa,1)\n";
}
 
Example 5
Source File: ReplicatingDataSourceTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * Tests that the compiler fails for a join program with a replicated data source behind a rebalance.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputBehindRebalance() {
	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.rebalance()
			.join(source2).where("*").equalTo("*")
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);
}
 
Example 6
Source File: FlinkUtil.java    From kylin with Apache License 2.0
public static DataSet parseInputPath(String inputPath, FileSystem fs, ExecutionEnvironment env, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }

    if (!hasDir) {
        return env.createInput(HadoopInputs.readSequenceFile(keyClass, valueClass, inputHDFSPath.toString()));
    }

    Job job = Job.getInstance();
    FileInputFormat.setInputPaths(job, StringUtil.join(inputFolders, ","));
    return env.createInput(HadoopInputs.createHadoopInput(new SequenceFileInputFormat(), keyClass, valueClass, job));
}
 
Example 7
Source File: ReplicatingDataSourceTest.java    From flink with Apache License 2.0
/**
 * Tests that the compiler fails for a join program with a replicated data source and changing parallelism.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputChangingparallelism() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.join(source2).where("*").equalTo("*").setParallelism(DEFAULT_PARALLELISM+2)
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);
}
 
Example 8
Source File: ReplicatingDataSourceTest.java    From flink with Apache License 2.0
/**
 * Tests join program with replicated data source behind flatMap.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindFlatMap() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.flatMap(new IdFlatMap())
			.join(source2).where("*").equalTo("*")
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized plan:
	// the join should have a forward ship strategy on both sides
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

	ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
	ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
 
Example 9
Source File: JoinCancelingITCase.java    From flink with Apache License 2.0
private void executeTask(JoinFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> joiner, boolean slow, int parallelism) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple2<Integer, Integer>> input1 = env.createInput(new InfiniteIntegerTupleInputFormat(slow));
	DataSet<Tuple2<Integer, Integer>> input2 = env.createInput(new InfiniteIntegerTupleInputFormat(slow));

	input1.join(input2, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
			.where(0)
			.equalTo(0)
			.with(joiner)
			.output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());

	env.setParallelism(parallelism);

	runAndCancelJob(env.createProgramPlan(), 5 * 1000, 10 * 1000);
}
 
Example 10
Source File: ReplicatingDataSourceTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * Tests join program with replicated data source behind map.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindMap() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.map(new IdMap())
			.join(source2).where("*").equalTo("*")
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized plan:
	// the join should have a forward ship strategy on both sides
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

	ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
	ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
 
Example 11
Source File: ReplicatingDataSourceTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * Tests cross program with replicated data source.
 */
@Test
public void checkCrossWithReplicatedSourceInput() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.cross(source2)
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized plan:
	// the cross should have a forward ship strategy on both sides
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	DualInputPlanNode crossNode = (DualInputPlanNode) sinkNode.getPredecessor();

	ShipStrategyType crossIn1 = crossNode.getInput1().getShipStrategy();
	ShipStrategyType crossIn2 = crossNode.getInput2().getShipStrategy();

	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, crossIn1);
	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, crossIn2);
}
 
Example 12
Source File: ReplicatingDataSourceTest.java    From flink with Apache License 2.0
/**
 * Tests join program with replicated data source behind map partition.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindMapPartition() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.mapPartition(new IdPMap())
			.join(source2).where("*").equalTo("*")
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized plan:
	// the join should have a forward ship strategy on both sides
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

	ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
	ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
 
Example 13
Source File: BatchPojoExample.java    From flink-learning with Apache License 2.0
public static void main(String[] args) throws Exception {

        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        List<CustomCassandraAnnotatedPojo> customCassandraAnnotatedPojos = IntStream.range(0, 20)
                .mapToObj(x -> new CustomCassandraAnnotatedPojo(UUID.randomUUID().toString(), x, 0))
                .collect(Collectors.toList());

        DataSet<CustomCassandraAnnotatedPojo> dataSet = env.fromCollection(customCassandraAnnotatedPojos);

        ClusterBuilder clusterBuilder = new ClusterBuilder() {
            private static final long serialVersionUID = -1754532803757154795L;

            @Override
            protected Cluster buildCluster(Cluster.Builder builder) {
                return builder.addContactPoints("127.0.0.1").build();
            }
        };

        dataSet.output(new CassandraPojoOutputFormat<>(clusterBuilder, CustomCassandraAnnotatedPojo.class, () -> new Mapper.Option[]{Mapper.Option.saveNullFields(true)}));

        env.execute("zhisheng");

        /*
         * Example of creating a DataSet using CassandraPojoInputFormat.
         */
        DataSet<CustomCassandraAnnotatedPojo> inputDS = env
                .createInput(new CassandraPojoInputFormat<>(
                        SELECT_QUERY,
                        clusterBuilder,
                        CustomCassandraAnnotatedPojo.class,
                        () -> new Mapper.Option[]{Mapper.Option.consistencyLevel(ConsistencyLevel.ANY)}
                ));

        inputDS.print();
    }
 
Example 14
Source File: AvroTypeExtractionTest.java    From flink with Apache License 2.0
private void testField(final String fieldName) throws Exception {
	before();

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	Path in = new Path(inFile.getAbsoluteFile().toURI());

	AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
	DataSet<User> usersDS = env.createInput(users);

	DataSet<Object> res = usersDS
		.groupBy(fieldName)
		.reduceGroup((GroupReduceFunction<User, Object>) (values, out) -> {
			for (User u : values) {
				out.collect(u.get(fieldName));
			}
		})
		.returns(Object.class);
	res.writeAsText(resultPath);
	env.execute("Simple Avro read job");

	// test if automatic registration of the Types worked
	ExecutionConfig ec = env.getConfig();
	Assert.assertTrue(ec.getRegisteredKryoTypes().contains(Fixed16.class));

	switch (fieldName) {
		case "name":
			expected = "Alyssa\nCharlie";
			break;
		case "type_enum":
			expected = "GREEN\nRED\n";
			break;
		case "type_double_test":
			expected = "123.45\n1.337\n";
			break;
		default:
			Assert.fail("Unknown field");
			break;
	}

	after();
}
 
Example 15
Source File: FlinkPravegaInputFormatITCase.java    From flink-connectors with Apache License 2.0
/**
 * Verifies that the input format:
 *  - correctly reads all records in a given set of multiple Pravega streams
 *  - allows multiple executions
 */
@Test
public void testBatchInput() throws Exception {
    final int numElements1 = 100;
    final int numElements2 = 300;

    // set up the stream
    final String streamName1 = RandomStringUtils.randomAlphabetic(20);
    final String streamName2 = RandomStringUtils.randomAlphabetic(20);

    final Set<String> streams = new HashSet<>();
    streams.add(streamName1);
    streams.add(streamName2);

    SETUP_UTILS.createTestStream(streamName1, 3);
    SETUP_UTILS.createTestStream(streamName2, 5);

    try (
            final EventStreamWriter<Integer> eventWriter1 = SETUP_UTILS.getIntegerWriter(streamName1);
            final EventStreamWriter<Integer> eventWriter2 = SETUP_UTILS.getIntegerWriter(streamName2);

            // create the producer that writes to the stream
            final ThrottledIntegerWriter producer1 = new ThrottledIntegerWriter(
                    eventWriter1,
                    numElements1,
                    numElements1 + 1, // no need to block writer for a batch test
                    0,
                    false
            );

            final ThrottledIntegerWriter producer2 = new ThrottledIntegerWriter(
                    eventWriter2,
                    numElements2,
                    numElements2 + 1, // no need to block writer for a batch test
                    0,
                    false
            )
    ) {
        // write batch input
        producer1.start();
        producer2.start();

        producer1.sync();
        producer2.sync();

        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        // simple pipeline that reads from Pravega and collects the events
        DataSet<Integer> integers = env.createInput(
                FlinkPravegaInputFormat.<Integer>builder()
                        .forStream(streamName1)
                        .forStream(streamName2)
                        .withPravegaConfig(SETUP_UTILS.getPravegaConfig())
                        .withDeserializationSchema(new IntegerDeserializationSchema())
                        .build(),
                BasicTypeInfo.INT_TYPE_INFO
        );

        // verify that all events were read
        Assert.assertEquals(numElements1 + numElements2, integers.collect().size());

        // this verifies that the input format allows multiple passes
        Assert.assertEquals(numElements1 + numElements2, integers.collect().size());
    }
}
 
Example 16
Source File: TPCHQuery3Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, OrderTable>> getOrdersDataSet(ExecutionEnvironment env) throws IOException {
	Job job = Job.getInstance();

	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "ID;CUSTKEY;ORDERDATE;SHIP_PRIORITY");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		OrderTable.class, job);

	ParquetThriftInputFormat.addInputPath(job, new Path(ordersPath));

	// Filter all Orders with o_orderdate < 12.03.1995
	ParquetThriftInputFormat.setUnboundRecordFilter(job, OrderFilter.class);

	DataSet<Tuple2<Void, OrderTable>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example 17
Source File: TPCHQuery3Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, LineitemTable>> getLineitemDataSet(ExecutionEnvironment env) throws 
	IOException {
	Job job = Job.getInstance();

	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "ORDERKEY;EXTENDEDPRICE;DISCOUNT;SHIPDATE");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		LineitemTable.class, job);

	// Filter all Lineitems with l_shipdate > 12.03.1995
	ParquetThriftInputFormat.setUnboundRecordFilter(job, LineitemFilter.class);

	ParquetThriftInputFormat.addInputPath(job, new Path(lineitemPath));

	DataSet<Tuple2<Void, LineitemTable>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example 18
Source File: AvroExternalJarProgram.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {
	String inputPath = args[0];

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<MyUser> input = env.createInput(new AvroInputFormat<MyUser>(new Path(inputPath), MyUser.class));

	DataSet<Tuple2<String, MyUser>> result = input.map(new NameExtractor()).groupBy(0).reduce(new NameGrouper());

	result.output(new DiscardingOutputFormat<Tuple2<String, MyUser>>());
	env.execute();
}
 
Example 19
Source File: AvroExternalJarProgram.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {
	String inputPath = args[0];

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<MyUser> input = env.createInput(new AvroInputFormat<MyUser>(new Path(inputPath), MyUser.class));

	DataSet<Tuple2<String, MyUser>> result = input.map(new NameExtractor()).groupBy(0).reduce(new NameGrouper());

	result.output(new DiscardingOutputFormat<Tuple2<String, MyUser>>());
	env.execute();
}