org.apache.hadoop.mapred.TextInputFormat Java Examples
The following examples show how to use org.apache.hadoop.mapred.TextInputFormat, the line-oriented input format of Hadoop's classic mapred API. Each example is taken from an open-source project; the source file, originating project, and license are noted above the code.
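Before the project-specific examples, here is a minimal, self-contained sketch of the usage pattern that recurs throughout: configure a JobConf, register an input path, let TextInputFormat compute splits, and read each split as (byte offset, line) pairs. The class name and the /tmp/input path below are illustrative assumptions, not taken from any of the projects.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatSketch {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(TextInputFormatSketch.class);

    // Point the job at an input path (hypothetical location).
    FileInputFormat.addInputPath(job, new Path("/tmp/input"));

    // TextInputFormat must be configured before splits are requested.
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    // Ask for at least one split and read each split line by line.
    InputSplit[] splits = informat.getSplits(job, 1);
    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader =
          informat.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey(); // byte offset of the line
      Text value = reader.createValue();     // the line itself
      try {
        while (reader.next(key, value)) {
          System.out.println(key.get() + "\t" + value);
        }
      }
      finally {
        reader.close();
      }
    }
  }
}

The same configure-then-getSplits sequence appears in the SystemDS reader examples below, while the MapReduce driver examples instead hand the format to the framework via conf.setInputFormat(TextInputFormat.class).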
Example #1
Source File: S3SelectPushdown.java From presto with Apache License 2.0

private static boolean isInputFormatSupported(Properties schema)
{
    String inputFormat = getInputFormatName(schema);

    if (TextInputFormat.class.getName().equals(inputFormat)) {
        if (!Objects.equals(schema.getProperty(SKIP_HEADER_COUNT_KEY, "0"), "0")) {
            // S3 Select supports skipping one line of headers, but it was returning incorrect results for
            // presto-hive-hadoop2/conf/files/test_table_with_header.csv.gz
            // TODO https://github.com/prestosql/presto/issues/2349
            return false;
        }
        if (!Objects.equals(schema.getProperty(SKIP_FOOTER_COUNT_KEY, "0"), "0")) {
            // S3 Select does not support skipping footers
            return false;
        }
        return true;
    }
    return false;
}
Example #2
Source File: TestMultipleInputs.java From hadoop with Apache License 2.0

public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
      MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
      KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
      .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
      .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
      .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Example #3
Source File: ReaderTextCSVParallel.java From systemds with Apache License 2.0

public CSVReadTask(InputSplit split, SplitOffsetInfos offsets,
    TextInputFormat informat, JobConf job, MatrixBlock dest,
    long rlen, long clen, boolean hasHeader, String delim,
    boolean fill, double fillValue, int splitCount)
{
  _split = split;
  _splitoffsets = offsets; // new SplitOffsetInfos(offsets);
  _sparse = dest.isInSparseFormat();
  _informat = informat;
  _job = job;
  _dest = dest;
  _rlen = rlen;
  _clen = clen;
  _isFirstSplit = (splitCount == 0);
  _hasHeader = hasHeader;
  _fill = fill;
  _fillValue = fillValue;
  _delim = delim;
  _rc = true;
  _splitCount = splitCount;
}
Example #4
Source File: WordCount.java From attic-apex-malhar with Apache License 2.0

public void run(String[] args) throws Exception
{
  JobConf conf = new JobConf(this.getClass());
  conf.setJobName("wordcount");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(Map.class);
  conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
Example #5
Source File: TestMultipleInputs.java From big-c with Apache License 2.0

public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
      MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
      KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
      .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
      .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
      .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Example #6
Source File: PipeMapper.java From hadoop-gpu with Apache License 2.0

public void configure(JobConf job) {
  super.configure(job);
  // disable the auto increment of the counter. For streaming, no of
  // processed records could be different (equal or less) than the no of
  // records input.
  SkipBadRecords.setAutoIncrMapperProcCount(job, false);
  skipping = job.getBoolean("mapred.skip.on", false);

  String inputFormatClassName =
      job.getClass("mapred.input.format.class", TextInputFormat.class).getCanonicalName();
  ignoreKey = inputFormatClassName.equals(TextInputFormat.class.getCanonicalName());

  try {
    mapOutputFieldSeparator = job.get("stream.map.output.field.separator", "\t").getBytes("UTF-8");
    mapInputFieldSeparator = job.get("stream.map.input.field.separator", "\t").getBytes("UTF-8");
    numOfMapOutputKeyFields = job.getInt("stream.num.map.output.key.fields", 1);
  } catch (UnsupportedEncodingException e) {
    throw new RuntimeException("The current system does not support UTF-8 encoding!", e);
  }
}
Example #7
Source File: PersonVersion.java From blog with MIT License

private static void runJobPv(String inputDir, String outputDir, String jobName,
    Class<? extends Mapper> mapClass, Class<? extends Reducer> reduceClass) throws Exception {
  JobConf conf = new JobConf(PersonVersion.class);
  conf.setJobName(jobName);

  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(IntWritable.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(mapClass);
  conf.setCombinerClass(reduceClass);
  conf.setReducerClass(reduceClass);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, inputDir);
  FileOutputFormat.setOutputPath(conf, new Path(outputDir));

  JobClient.runJob(conf);
}
Example #8
Source File: FrameReaderTextCell.java From systemds with Apache License 2.0

protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
    ValueType[] schema, String[] names, long rlen, long clen)
  throws IOException
{
  if( fs.isDirectory(path) ) {
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    for(InputSplit split : splits)
      readTextCellFrameFromInputSplit(split, informat, job, dest);
  }
  else {
    readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
  }
}
Example #9
Source File: ReadExistingDataJob.java From tracing-framework with BSD 3-Clause "New" or "Revised" License

public void configure(JobConf job) {
  // Set the mapper and reducers
  job.setMapperClass(ReadDataJob.TestMapper.class);

  // Make sure this jar is included
  job.setJarByClass(ReadDataJob.TestMapper.class);

  // Specify the input and output data formats
  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);

  // Turn off speculative execution
  job.setMapSpeculativeExecution(false);
  job.setReduceSpeculativeExecution(false);

  // Add the job input path
  FileInputFormat.addInputPath(job, new Path(this.input_path));
}
Example #10
Source File: TestUtils.java From circus-train with Apache License 2.0

public static Table createUnpartitionedTable(
    HiveMetaStoreClient metaStoreClient,
    String database,
    String table,
    URI location)
  throws TException {
  Table hiveTable = new Table();
  hiveTable.setDbName(database);
  hiveTable.setTableName(table);
  hiveTable.setTableType(TableType.EXTERNAL_TABLE.name());
  hiveTable.putToParameters("EXTERNAL", "TRUE");

  StorageDescriptor sd = new StorageDescriptor();
  sd.setCols(DATA_COLUMNS);
  sd.setLocation(location.toString());
  sd.setParameters(new HashMap<String, String>());
  sd.setInputFormat(TextInputFormat.class.getName());
  sd.setOutputFormat(TextOutputFormat.class.getName());
  sd.setSerdeInfo(new SerDeInfo());
  sd.getSerdeInfo().setSerializationLib("org.apache.hadoop.hive.serde2.OpenCSVSerde");

  hiveTable.setSd(sd);
  metaStoreClient.createTable(hiveTable);

  ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, database, table);
  ColumnStatisticsData statsData = new ColumnStatisticsData(_Fields.LONG_STATS, new LongColumnStatsData(1L, 2L));
  ColumnStatisticsObj cso1 = new ColumnStatisticsObj("id", "bigint", statsData);
  List<ColumnStatisticsObj> statsObj = Collections.singletonList(cso1);
  metaStoreClient.updateTableColumnStatistics(new ColumnStatistics(statsDesc, statsObj));

  return hiveTable;
}
Example #11
Source File: FrameReaderJSONLParallel.java From systemds with Apache License 2.0

public ReadRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf,
    FrameBlock dest, Map<String, Integer> schemaMap, int offset)
{
  _split = split;
  _inputFormat = inputFormat;
  _jobConf = jobConf;
  _dest = dest;
  _schemaMap = schemaMap;
  _offset = offset;
}
Example #12
Source File: ReaderTextLIBSVMParallel.java From systemds with Apache License 2.0

@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen,
    int blen, long estnnz)
  throws IOException, DMLRuntimeException
{
  // prepare file access
  JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
  Path path = new Path(fname);
  FileSystem fs = IOUtilFunctions.getFileSystem(path, job);

  FileInputFormat.addInputPath(job, path);
  TextInputFormat informat = new TextInputFormat();
  informat.configure(job);

  InputSplit[] splits = informat.getSplits(job, _numThreads);
  splits = IOUtilFunctions.sortInputSplits(splits);

  // check existence and non-empty file
  checkValidInputFile(fs, path);

  // allocate output matrix block
  // First Read Pass (count rows/cols, determine offsets, allocate matrix block)
  MatrixBlock ret = computeLIBSVMSizeAndCreateOutputMatrixBlock(splits, path,
      job, rlen, clen, estnnz);
  rlen = ret.getNumRows();
  clen = ret.getNumColumns();

  // Second Read Pass (read, parse strings, append to matrix block)
  readLIBSVMMatrixFromHDFS(splits, path, job, ret, rlen, clen, blen);

  // post-processing (representation-specific, change of sparse/dense block representation)
  // - nnz explicitly maintained in parallel for the individual splits
  ret.examSparsity();

  // sanity check for parallel row count (since determined internally)
  if (rlen >= 0 && rlen != ret.getNumRows())
    throw new DMLRuntimeException("Read matrix inconsistent with given meta data: "
        + "expected nrow=" + rlen + ", real nrow=" + ret.getNumRows());

  return ret;
}
Example #13
Source File: TestMROldApiJobs.java From hadoop with Apache License 2.0

static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps,
    int numReds) throws IOException, InterruptedException {

  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (!fs.exists(inDir)) {
    fs.mkdirs(inDir);
  }
  String input = "The quick brown fox\n" + "has many silly\n"
      + "red fox sox\n";
  for (int i = 0; i < numMaps; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }

  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  JobClient jobClient = new JobClient(conf);
  RunningJob job = jobClient.submitJob(conf);
  return jobClient.monitorAndPrintJob(conf, job);
}
Example #14
Source File: NewWordCountApplication.java From attic-apex-malhar with Apache License 2.0

public void NewWordCountApplication() {
  setMapClass(WordCount.Map.class);
  setReduceClass(WordCount.Reduce.class);
  setCombineClass(WordCount.Reduce.class);
  setInputFormat(TextInputFormat.class);
}
Example #15
Source File: HiveMetaStoreBridgeTest.java From incubator-atlas with Apache License 2.0

private Table createTestTable(String databaseName, String tableName) throws HiveException {
  Table table = new Table(databaseName, tableName);
  table.setInputFormatClass(TextInputFormat.class);
  table.setFields(new ArrayList<FieldSchema>() {{
    add(new FieldSchema("col1", "string", "comment1"));
  }});
  table.setTableType(TableType.EXTERNAL_TABLE);
  table.setDataLocation(new Path("somehdfspath"));
  return table;
}
Example #16
Source File: TestMRAppWithCombiner.java From hadoop with Apache License 2.0

@Test
public void testCombinerShouldUpdateTheReporter() throws Exception {
  JobConf conf = new JobConf(mrCluster.getConfig());
  int numMaps = 5;
  int numReds = 2;
  Path in = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-in");
  Path out = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-out");
  createInputOutPutFolder(in, out, numMaps);
  conf.setJobName("test-job-with-combiner");
  conf.setMapperClass(IdentityMapper.class);
  conf.setCombinerClass(MyCombinerToCheckReporter.class);
  //conf.setJarByClass(MyCombinerToCheckReporter.class);
  conf.setReducerClass(IdentityReducer.class);
  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf);

  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, in);
  FileOutputFormat.setOutputPath(conf, out);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  runJob(conf);
}
Example #17
Source File: PipesNonJavaInputFormat.java From big-c with Apache License 2.0

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // Delegate the generation of input splits to the 'original' InputFormat
  return ReflectionUtils.newInstance(
      job.getClass(Submitter.INPUT_FORMAT,
                   TextInputFormat.class,
                   InputFormat.class), job).getSplits(job, numSplits);
}
Example #18
Source File: FieldSelectionMapReduce.java From big-c with Apache License 2.0

public void configure(JobConf job) {
  this.fieldSeparator = job.get(FieldSelectionHelper.DATA_FIELD_SEPERATOR, "\t");
  this.mapOutputKeyValueSpec = job.get(
      FieldSelectionHelper.MAP_OUTPUT_KEY_VALUE_SPEC, "0-:");
  this.ignoreInputKey = TextInputFormat.class.getCanonicalName().equals(
      job.getInputFormat().getClass().getCanonicalName());
  this.reduceOutputKeyValueSpec = job.get(
      FieldSelectionHelper.REDUCE_OUTPUT_KEY_VALUE_SPEC, "0-:");
  parseOutputKeyValueSpec();
  LOG.info(specToString());
}
Example #19
Source File: ReaderTextLIBSVMParallel.java From systemds with Apache License 2.0

private void readLIBSVMMatrixFromHDFS(InputSplit[] splits, Path path,
    JobConf job, MatrixBlock dest, long rlen, long clen, int blen)
  throws IOException
{
  FileInputFormat.addInputPath(job, path);
  TextInputFormat informat = new TextInputFormat();
  informat.configure(job);

  ExecutorService pool = CommonThreadPool.get(_numThreads);
  try {
    // create read tasks for all splits
    ArrayList<LIBSVMReadTask> tasks = new ArrayList<>();
    int splitCount = 0;
    for (InputSplit split : splits) {
      tasks.add( new LIBSVMReadTask(split, _offsets, informat, job, dest, rlen, clen, splitCount++) );
    }
    pool.invokeAll(tasks);
    pool.shutdown();

    // check return codes and aggregate nnz
    long lnnz = 0;
    for (LIBSVMReadTask rt : tasks) {
      lnnz += rt.getPartialNnz();
      if (!rt.getReturnCode()) {
        Exception err = rt.getException();
        throw new IOException("Read task for libsvm input failed: " + err.toString(), err);
      }
    }
    dest.setNonZeros(lnnz);
  }
  catch (Exception e) {
    throw new IOException("Threadpool issue, while parallel read.", e);
  }
}
Example #20
Source File: PipesNonJavaInputFormat.java From hadoop with Apache License 2.0

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // Delegate the generation of input splits to the 'original' InputFormat
  return ReflectionUtils.newInstance(
      job.getClass(Submitter.INPUT_FORMAT,
                   TextInputFormat.class,
                   InputFormat.class), job).getSplits(job, numSplits);
}
Example #21
Source File: ProfileFactoryTest.java From pxf with Apache License 2.0

@Test
public void get() throws Exception {

  // For TextInputFormat when table has no complex types, HiveText profile should be used
  String profileName = ProfileFactory.get(new TextInputFormat(), false);
  assertEquals("HiveText", profileName);

  // For TextInputFormat when table has complex types, Hive profile should be used, HiveText doesn't support complex types yet
  profileName = ProfileFactory.get(new TextInputFormat(), true);
  assertEquals("Hive", profileName);

  // For RCFileInputFormat when table has complex types, HiveRC profile should be used
  profileName = ProfileFactory.get(new RCFileInputFormat(), true);
  assertEquals("HiveRC", profileName);

  // For RCFileInputFormat when table has no complex types, HiveRC profile should be used
  profileName = ProfileFactory.get(new RCFileInputFormat(), false);
  assertEquals("HiveRC", profileName);

  // For OrcInputFormat when table has complex types, HiveORC profile should be used
  profileName = ProfileFactory.get(new OrcInputFormat(), true);
  assertEquals("HiveORC", profileName);

  // For OrcInputFormat when table has no complex types, HiveORC profile should be used
  profileName = ProfileFactory.get(new OrcInputFormat(), false);
  assertEquals("HiveORC", profileName);

  // For other formats Hive profile should be used
  profileName = ProfileFactory.get(new SequenceFileInputFilter(), false);
  assertEquals("Hive", profileName);
}
Example #22
Source File: ReaderTextCellParallel.java From systemds with Apache License 2.0

public CountNnzTask( InputSplit split, TextInputFormat informat, JobConf job,
    int[] rNnz, boolean isSymmetric ) {
  _split = split;
  _informat = informat;
  _job = job;
  _rNnz = rNnz;
  _isSymmetric = isSymmetric;
}
Example #23
Source File: CSVIngestMapper.java From hadoop-solr with Apache License 2.0

@Override
public void init(JobConf conf) throws IOException {
  boolean override = conf.getBoolean(IngestJob.INPUT_FORMAT_OVERRIDE, false);
  if (!override) {
    conf.setInputFormat(TextInputFormat.class);
  } // else the user has overridden the input format and we assume it is OK.
  byte[] delimiterBase64 = Base64.encodeBase64(conf.get(CSV_DELIMITER, "").getBytes());
  conf.set(CSV_DELIMITER, new String(delimiterBase64));
}
Example #24
Source File: LogsCountApplication.java From attic-apex-malhar with Apache License 2.0

public void LogsCountApplication() {
  setMapClass(LogCountsPerHour.LogMapClass.class);
  // setCombineClass(LogCountsPerHour.LogReduce.class);
  setReduceClass(LogCountsPerHour.LogReduce.class);
  setInputFormat(TextInputFormat.class);
}
Example #25
Source File: ReaderTextCellParallel.java From systemds with Apache License 2.0

public ReadTask( InputSplit split, TextInputFormat informat, JobConf job,
    MatrixBlock dest, long rlen, long clen, boolean mm, FileFormatPropertiesMM mmProps ) {
  _split = split;
  _sparse = dest.isInSparseFormat();
  _informat = informat;
  _job = job;
  _dest = dest;
  _rlen = rlen;
  _clen = clen;
  _matrixMarket = mm;
  _mmProps = mmProps;
}
Example #26
Source File: HadoopMapredCompatWordCount.java From flink with Apache License 2.0

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
  TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
  DataSet<Tuple2<Text, LongWritable>> words =
      text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
          .groupBy(0)
          .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
  hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
  TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

  // Output & Execute
  words.output(hadoopOutputFormat).setParallelism(1);
  env.execute("Hadoop Compat WordCount");
}
Example #27
Source File: WordCountWithHadoopOutputFormat.java From stratosphere with Apache License 2.0

@Override
public Plan getPlan(String... args) {
  // parse job parameters
  int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
  String dataInput = (args.length > 1 ? args[1] : "");
  String output = (args.length > 2 ? args[2] : "");

  HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>(
      new TextInputFormat(), new JobConf(), "Input Lines");
  TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

  MapOperator mapper = MapOperator.builder(new TokenizeLine())
      .input(source)
      .name("Tokenize Lines")
      .build();
  ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
      .input(mapper)
      .name("Count Words")
      .build();
  HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(
      new TextOutputFormat<Text, IntWritable>(), new JobConf(), "Hadoop TextOutputFormat",
      reducer, Text.class, IntWritable.class);
  TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));

  Plan plan = new Plan(out, "Hadoop OutputFormat Example");
  plan.setDefaultParallelism(numSubTasks);
  return plan;
}
Example #28
Source File: S3SelectPushdown.java From presto with Apache License 2.0

public static boolean isCompressionCodecSupported(InputFormat<?, ?> inputFormat, Path path)
{
    if (inputFormat instanceof TextInputFormat) {
        // TODO (https://github.com/prestosql/presto/issues/2475) fix S3 Select when file not compressed
        return getCompressionCodec((TextInputFormat) inputFormat, path)
                .map(codec -> (codec instanceof GzipCodec) || (codec instanceof BZip2Codec))
                .orElse(false);
    }
    return false;
}