Java Code Examples for cascading.tap.hadoop.Hfs

The following examples show how to use cascading.tap.hadoop.Hfs, the Cascading tap for reading and writing files on Hadoop-compatible file systems. The examples are extracted from open source projects; the originating project and source file are noted above each example.
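
For orientation: an Hfs tap pairs a Scheme (the storage format) with a path string, and optionally a SinkMode. A minimal sketch of both directions, with placeholder paths:

Scheme scheme = new TextLine( new Fields( "offset", "line" ) );

// source tap: read text lines from an existing location
Tap source = new Hfs( scheme, "hdfs:///path/to/input" );

// sink tap: write text lines, replacing any existing output
Tap sink = new Hfs( scheme, "hdfs:///path/to/output", SinkMode.REPLACE );
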
Example 1
Source Project: cascading-flink   Source File: FlinkFlowProcess.java    License: Apache License 2.0
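This FlinkFlowProcess method opens a trap for writing failed tuples. Only Hfs traps are supported: it builds a unique part-file name from the flow step, node, and slice numbers, and sets a synthetic Hadoop task ID so the tap can be opened outside of a real MapReduce task.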
@Override
public TupleEntryCollector openTrapForWrite(Tap trap) throws IOException {

	if (trap instanceof Hfs) {

		JobConf jobConf = new JobConf(this.getConfigCopy());

		int stepNum = jobConf.getInt( "cascading.flow.step.num", 0 );
		int nodeNum = jobConf.getInt( "cascading.flow.node.num", 0 );

		String partname = String.format( "-%05d-%05d-%05d", stepNum, nodeNum, this.getCurrentSliceNum() );
		jobConf.set( "cascading.tapcollector.partname", "%s%spart" + partname );

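		// synthesize a task attempt ID: this code runs inside a Flink task, so no real Hadoop task ID exists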
		String value = String.format( "attempt_%012d_0000_m_%06d_0", (int) Math.rint( System.currentTimeMillis() ), this.getCurrentSliceNum() );
		jobConf.set( "mapred.task.id", value );
		jobConf.set( "mapreduce.task.id", value );

		return trap.openForWrite( new FlinkFlowProcess( jobConf ), null);
	}
	else {
		throw new UnsupportedOperationException("Only Hfs taps are supported as traps");
	}
}
 
Example 2
Source Project: cascading-flink   Source File: WordCount.java    License: Apache License 2.0
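A complete word count: RegexSplitGenerator splits each line of text into tokens, AggregateBy with CountBy counts them, and Hfs taps over TextDelimited schemes serve as source and sink.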
public static void main(String[] args) {

		if (args.length < 2) {
			throw new IllegalArgumentException("Please specify input and output paths as arguments.");
		}

		Fields token = new Fields( "token", String.class );
		Fields text = new Fields( "text" );
		RegexSplitGenerator splitter = new RegexSplitGenerator( token, "\\s+" );
		// only returns "token"
		Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );

		Pipe wcPipe = new Pipe( "wc", docPipe );
		wcPipe = new AggregateBy( wcPipe, token, new CountBy(new Fields("count")));

		Tap inTap = new Hfs(new TextDelimited(text, "\n" ), args[0]);
		Tap outTap = new Hfs(new TextDelimited(false, "\n"), args[1], SinkMode.REPLACE);

		FlowDef flowDef = FlowDef.flowDef().setName( "wc" )
				.addSource( docPipe, inTap )
				.addTailSink( wcPipe, outTap );

		FlowConnector flowConnector = new FlinkConnector();

		Flow wcFlow = flowConnector.connect( flowDef );

		wcFlow.complete();
	}
 
Example 3
Source Project: cascading-flink   Source File: FlinkTestPlatform.java    License: Apache License 2.0
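Creates a plain-text Hfs tap for the Flink test platform, falling back to TextLine's default fields when no source fields are given.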
@Override
public Tap getTextFile(Fields sourceFields, Fields sinkFields, String filename, SinkMode mode) {
	if( sourceFields == null ) {
			return new Hfs(new TextLine(), filename, mode);
	}

	return new Hfs( new TextLine( sourceFields, sinkFields ), filename, mode );
}
 
Example 4
Source Project: cascading-flink   Source File: FlinkTestPlatform.java    License: Apache License 2.0
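Wraps an arbitrary Scheme in an Hfs tap.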
@Override
public Tap getTap(Scheme scheme, String filename, SinkMode mode) {
	return new Hfs(scheme, filename, mode);
}
 
Example 5
Source Project: cascading-flink   Source File: FlinkTestPlatform.java    License: Apache License 2.0
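Creates an Hfs tap over a TextDelimited scheme with an optional header, a custom delimiter and quote character, and declared field types.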
@Override
public Tap getDelimitedFile(Fields fields, boolean hasHeader, String delimiter, String quote,
							Class[] types, String filename, SinkMode mode) {
	return new Hfs( new TextDelimited( fields, hasHeader, delimiter, quote, types ), filename, mode );
}
 
Example 6
Source Project: cascading-flink   Source File: FlinkTestPlatform.java    License: Apache License 2.0
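A variant with separate flags for skipping a header on read and writing a header on sink.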
@Override
public Tap getDelimitedFile(Fields fields, boolean skipHeader, boolean writeHeader, String delimiter,
							String quote, Class[] types, String filename, SinkMode mode) {
	return new Hfs( new TextDelimited( fields, skipHeader, writeHeader, delimiter, quote, types ), filename, mode );
}
 
Example 7
Source Project: cascading-flink   Source File: FlinkTestPlatform.java    License: Apache License 2.0
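Builds the TextDelimited scheme from a DelimitedParser, letting the given FieldTypeResolver derive field names and types from the header.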
@Override
public Tap getDelimitedFile(String delimiter, String quote, FieldTypeResolver fieldTypeResolver, String filename, SinkMode mode) {
	return new Hfs( new TextDelimited( true, new DelimitedParser( delimiter, quote, fieldTypeResolver ) ), filename, mode );
}
 
Example 8
Source Project: cascading-flink   Source File: FlinkTestPlatform.java    License: Apache License 2.0
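Partitions an Hfs sink into subdirectories with a PartitionTap; openThreshold limits the number of partition writers held open at once.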
@Override
public Tap getPartitionTap(Tap sink, Partition partition, int openThreshold) {
	return new PartitionTap( (Hfs) sink, partition, openThreshold );
}
 
Example 9
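A join-and-filter job: two pipe assemblies read delimited Hfs sources, the foo side is filtered, both sides are joined with a map-side HashJoin, and the joined tuples are filtered again before being written to a delimited Hfs sink.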
public static void main(String[] args) {
  String fooInputPath = args[0];
  String barInputPath = args[1];
  String outputPath = args[2];
  int fooValMax = Integer.parseInt(args[3]);
  int joinValMax = Integer.parseInt(args[4]);
  int numberOfReducers = Integer.parseInt(args[5]);

  Properties properties = new Properties();
  AppProps.setApplicationJarClass(properties,
      JoinFilterExampleCascading.class);
  properties.setProperty("mapred.reduce.tasks", Integer.toString(numberOfReducers));
  properties.setProperty("mapreduce.job.reduces", Integer.toString(numberOfReducers));
  
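  // spill settings for the tuple collections the HashJoin holds in memory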
  SpillableProps props = SpillableProps.spillableProps()
      .setCompressSpill( true )
      .setMapSpillThreshold( 50 * 1000 );

  HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);

  // create source and sink taps
  Fields fooFields = new Fields("fooId", "fooVal", "foobarId");
  Tap fooTap = new Hfs(new TextDelimited(fooFields, "|"), fooInputPath);
  Fields barFields = new Fields("barId", "barVal");
  Tap barTap = new Hfs(new TextDelimited(barFields, "|"), barInputPath);

  Tap outputTap = new Hfs(new TextDelimited(false, "|"), outputPath);

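  // join keys: foo.foobarId matches bar.barId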
  Fields joinFooFields = new Fields("foobarId");
  Fields joinBarFields = new Fields("barId");

  Pipe fooPipe = new Pipe("fooPipe");
  Pipe barPipe = new Pipe("barPipe");

  Pipe fooFiltered = new Each(fooPipe, fooFields, new FooFilter(fooValMax));

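  // map-side HashJoin: the left side (fooFiltered) is streamed, the right side (barPipe) is accumulated in memory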
  Pipe joinedPipe = new HashJoin(fooFiltered, joinFooFields, barPipe,
      joinBarFields);
  props.setProperties( joinedPipe.getConfigDef(), Mode.REPLACE );

  Fields joinFields = new Fields("fooId", "fooVal", "foobarId", "barVal");
  Pipe joinedFilteredPipe = new Each(joinedPipe, joinFields,
      new JoinedFilter(joinValMax));

  FlowDef flowDef = FlowDef.flowDef().setName("wc")
      .addSource(fooPipe, fooTap).addSource(barPipe, barTap)
      .addTailSink(joinedFilteredPipe, outputTap);

  Flow wcFlow = flowConnector.connect(flowDef);
  wcFlow.writeDOT("dot/wc.dot");
  wcFlow.complete();
}
 
Example 10
Source Project: parquet-mr   Source File: ParquetTupleScheme.java    License: Apache License 2.0
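Collects the Parquet footers for all files under the tap's path, using the JobConf taken from the flow process.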
private List<Footer> getFooters(FlowProcess<JobConf> flowProcess, Hfs hfs) throws IOException {
  JobConf jobConf = flowProcess.getConfigCopy();
  DeprecatedParquetInputFormat format = new DeprecatedParquetInputFormat();
  format.addInputPath(jobConf, hfs.getPath());
  return format.getFooters(jobConf);
}
 
Example 11
Source Project: parquet-mr   Source File: ParquetTupleScheme.java    License: Apache License 2.0
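A variant of the same helper, with the signature widened to accept FlowProcess&lt;? extends JobConf&gt;.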
private List<Footer> getFooters(FlowProcess<? extends JobConf> flowProcess, Hfs hfs) throws IOException {
  JobConf jobConf = flowProcess.getConfigCopy();
  DeprecatedParquetInputFormat format = new DeprecatedParquetInputFormat();
  format.addInputPath(jobConf, hfs.getPath());
  return format.getFooters(jobConf);
}