Java Code Examples for org.apache.hadoop.mapred.FileInputFormat#setInputPaths()

The following examples show how to use org.apache.hadoop.mapred.FileInputFormat#setInputPaths(). Each example is taken from the open-source project named in its header.
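Before the project examples, here is a minimal, self-contained sketch of the typical call pattern in the old (org.apache.hadoop.mapred) API. It is an illustration rather than code from any of the projects below: the paths /tmp/input and /tmp/output are placeholders, and the job relies on Hadoop's default identity mapper and reducer.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetInputPathsDemo {

  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(SetInputPathsDemo.class);
    job.setJobName("setInputPaths demo");

    // setInputPaths replaces any previously configured input paths; it accepts
    // either Path... or a comma-separated String of paths. Use
    // FileInputFormat.addInputPath(job, path) to append a path instead.
    FileInputFormat.setInputPaths(job, new Path("/tmp/input"));

    // With TextInputFormat and the default identity mapper/reducer, the job
    // simply copies (LongWritable offset, Text line) records to the output.
    job.setInputFormat(TextInputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path("/tmp/output"));

    JobClient.runJob(job);
  }
}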
Example 1
Source File: MultipleHiveFragmentsPerFileFragmenter.java    From pxf with Apache License 2.0
private String getFilePath(Table tbl) throws Exception {

  StorageDescriptor descTable = tbl.getSd();

  InputFormat<?, ?> fformat = HiveDataFragmenter.makeInputFormat(descTable.getInputFormat(), jobConf);

  FileInputFormat.setInputPaths(jobConf, new Path(descTable.getLocation()));

  InputSplit[] splits;
  try {
    splits = fformat.getSplits(jobConf, 1);
  } catch (org.apache.hadoop.mapred.InvalidInputException e) {
    LOG.debug("getSplits failed on " + e.getMessage());
    throw new RuntimeException("Unable to get file path for table.");
  }

  // Return the path of the first split.
  for (InputSplit split : splits) {
    FileSplit fsp = (FileSplit) split;
    String[] hosts = fsp.getLocations();
    String filepath = fsp.getPath().toString();
    return filepath;
  }
  throw new RuntimeException("Unable to get file path for table.");
}
 
Example 2
Source File: TestFileSystem.java    From hadoop-gpu with Apache License 2.0
public static void readTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);


  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(ReadMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example 3
Source File: TestDFSIO.java    From hadoop with Apache License 2.0
private void runIOTest(
        Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass, 
        Path outputDir) throws IOException {
  JobConf job = new JobConf(config, TestDFSIO.class);

  FileInputFormat.setInputPaths(job, getControlDir(config));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example 4
Source File: TestDFSIO.java    From big-c with Apache License 2.0
private void runIOTest(
        Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass, 
        Path outputDir) throws IOException {
  JobConf job = new JobConf(config, TestDFSIO.class);

  FileInputFormat.setInputPaths(job, getControlDir(config));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example 5
Source File: TestHoodieParquetInputFormat.java    From hudi with Apache License 2.0
@Test
public void testInputFormatUpdates() throws IOException {
  // initial commit
  File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
  InputFormatTestUtil.commit(basePath, "100");

  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  FileStatus[] files = inputFormat.listStatus(jobConf);
  assertEquals(10, files.length);

  // update files
  InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 5, "200", true);
  // Before the commit
  files = inputFormat.listStatus(jobConf);
  assertEquals(10, files.length);
  ensureFilesInCommit("Commit 200 has not been committed. We should not see files from this commit", files, "200", 0);
  InputFormatTestUtil.commit(basePath, "200");
  files = inputFormat.listStatus(jobConf);
  assertEquals(10, files.length);
  ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 200 and 5 "
      + "files from 100 commit", files, "200", 5);
  ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 "
      + "files from 200 commit", files, "100", 5);
}
 
Example 6
Source File: TestMultiMRInput.java    From tez with Apache License 2.0
@Test(timeout = 5000)
public void testExtraEvents() throws Exception {
  Path workDir = new Path(TEST_ROOT_DIR, "testExtraEvents");
  JobConf jobConf = new JobConf(defaultConf);
  jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(jobConf, workDir);

  InputContext inputContext = createTezInputContext(jobConf, new Configuration(false));

  MultiMRInput input = new MultiMRInput(inputContext, 1);
  input.initialize();

  createSplits(1, workDir, jobConf, new AtomicLong());

  SequenceFileInputFormat<LongWritable, Text> format =
      new SequenceFileInputFormat<LongWritable, Text>();
  InputSplit[] splits = format.getSplits(jobConf, 1);
  assertEquals(1, splits.length);

  MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
  InputDataInformationEvent event1 =
      InputDataInformationEvent.createWithSerializedPayload(0,
          splitProto.toByteString().asReadOnlyByteBuffer());
  InputDataInformationEvent event2 =
      InputDataInformationEvent.createWithSerializedPayload(1,
          splitProto.toByteString().asReadOnlyByteBuffer());

  List<Event> eventList = new ArrayList<Event>();
  eventList.add(event1);
  eventList.add(event2);
  try {
    input.handleEvents(eventList);
    fail("Expecting Exception due to too many events");
  } catch (Exception e) {
    assertTrue(e.getMessage().contains(
        "Unexpected event. All physical sources already initialized"));
  }
}
 
Example 7
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock447533() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "block447533.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);

  assertEquals(1, inputSplits.length, "Only one split generated for block 447533");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 447533 contains at least one block");
  assertEquals(2, block.getEthereumTransactions().size(), "Block 447533 must have 2 transactions");
  EthereumBlockHeader ethereumBlockHeader = block.getEthereumBlockHeader();
  assertEquals(
      "a027231f42c80ca4125b5cb962a21cd4f812e88f",
      bytesToHex(ethereumBlockHeader.getCoinBase()).toLowerCase(),
      "Block 447533 was mined by a027231f42c80ca4125b5cb962a21cd4f812e88f");
  assertEquals(
      "043559b70c54f0eea6a90b384286d7ab312129603e750075d09fd35e66f8068a",
      bytesToHex(ethereumBlockHeader.getParentHash()).toLowerCase(),
      "The parent of block 447533 has hash 043559b70c54f0eea6a90b384286d7ab312129603e750075d09fd35e66f8068a");
  assertFalse(reader.next(key, block), "No further block in block 447533");

  reader.close();
}
 
Example 8
Source File: TestMROldApiJobs.java    From hadoop with Apache License 2.0
static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps, 
                         int numReds) throws IOException, InterruptedException {

  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (!fs.exists(inDir)) {
    fs.mkdirs(inDir);
  }
  String input = "The quick brown fox\n" + "has many silly\n"
      + "red fox sox\n";
  for (int i = 0; i < numMaps; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }

  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  JobClient jobClient = new JobClient(conf);
  
  RunningJob job = jobClient.submitJob(conf);
  return jobClient.monitorAndPrintJob(conf, job);
}
 
Example 9
Source File: TestHoodieParquetInputFormat.java    From hudi with Apache License 2.0
@Test
public void testPredicatePushDown() throws IOException {
  // initial commit
  Schema schema = InputFormatTestUtil.readSchema("/sample1.avsc");
  String commit1 = "20160628071126";
  File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, 10, commit1);
  InputFormatTestUtil.commit(basePath, commit1);
  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
  // check whether we have 10 records at this point
  ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, 10, 10);

  // update 2 records in the original parquet file and save them as commit2
  String commit2 = "20160629193623";
  InputFormatTestUtil.simulateParquetUpdates(partitionDir, schema, commit1, 10, 2, commit2);
  InputFormatTestUtil.commit(basePath, commit2);

  InputFormatTestUtil.setupIncremental(jobConf, commit1, 1);
  // check whether we have 2 records at this point
  ensureRecordsInCommit("We need to have 2 records that were modified at commit " + commit2 + " and no more", commit2,
      2, 2);
  // Make sure we see all 10 records if we roll back the start time
  InputFormatTestUtil.setupIncremental(jobConf, "0", 2);
  ensureRecordsInCommit("We need to have 8 records that were modified at commit " + commit1 + " and no more", commit1,
      8, 10);
  ensureRecordsInCommit("We need to have 2 records that were modified at commit " + commit2 + " and no more", commit2,
      2, 10);
}
 
Example 10
Source File: MapOperator.java    From attic-apex-malhar with Apache License 2.0
private InputSplit[] getSplits(JobConf conf, int numSplits, String path) throws Exception
{
  FileInputFormat.setInputPaths(conf, new Path(path));
  if (inputFormat == null) {
    inputFormat = inputFormatClass.newInstance();
    String inputFormatClassName = inputFormatClass.getName();
    if (inputFormatClassName.equals("org.apache.hadoop.mapred.TextInputFormat")) {
      ((TextInputFormat)inputFormat).configure(conf);
    } else if (inputFormatClassName.equals("org.apache.hadoop.mapred.KeyValueTextInputFormat")) {
      ((KeyValueTextInputFormat)inputFormat).configure(conf);
    }
  }
  return inputFormat.getSplits(conf, numSplits);
}
 
Example 11
Source File: TopBusyAirportGemfirexd.java    From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {

    GfxdDataSerializable.initTypes();

    JobConf conf = new JobConf(getConf());
    conf.setJobName("Busy Airport Count");

    Path outputPath = new Path(args[0]);
    Path intermediateOutputPath = new Path(args[0] + "_int");
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);
    intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    conf.setInputFormat(RowInputFormat.class);
    conf.setMapperClass(SampleMapper.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setReducerClass(SampleReducer.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(conf, intermediateOutputPath);

    int rc = JobClient.runJob(conf).isSuccessful() ? 0 : 1;
    if (rc == 0) {
      JobConf topConf = new JobConf(getConf());
      topConf.setJobName("Top Busy Airport");

      String hdfsFS = topConf.get("fs.defaultFS");
      URI hdfsUri = URI.create(hdfsFS);
      hdfsUri.getHost();

      // Assume that SqlFire locator is running alongside the namenode
      topConf.set(RowOutputFormat.OUTPUT_URL, "jdbc:gemfirexd://" + hdfsUri.getHost() + ":1527");
      //topConf.set(ddGfxdOutputFormat.OUTPUT_SCHEMA, "APP");
      //topConf.set(GfxdOutputFormat.OUTPUT_TABLE, "BUSY_AIRPORT");
      topConf.set(RowOutputFormat.OUTPUT_TABLE, "APP.BUSY_AIRPORT");

      // Only run a single reducer
      topConf.setNumReduceTasks(1);

      FileInputFormat.setInputPaths(topConf, intermediateOutputPath);

      topConf.setInputFormat(TextInputFormat.class);
      topConf.setMapperClass(TopBusyAirportMapper.class);
      topConf.setMapOutputKeyClass(Text.class);
      topConf.setMapOutputValueClass(StringIntPair.class);

      topConf.setReducerClass(TopBusyAirportReducer.class);
      topConf.setOutputKeyClass(Key.class);
      topConf.setOutputValueClass(BusyAirportModel.class);
      topConf.setOutputFormat(RowOutputFormat.class);

      rc = JobClient.runJob(topConf).isSuccessful() ? 0 : 1;
    }
    return rc;
  }
 
Example 12
Source File: TestHoodieMergeOnReadTable.java    From hudi with Apache License 2.0
private FileStatus[] getRTIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull)
        throws Exception {
  setupIncremental(rtJobConf, startCommitTime, numCommitsToPull, false);
  FileInputFormat.setInputPaths(rtJobConf, Paths.get(basePath, partitionPath).toString());
  return listStatus(rtJobConf, true);
}
 
Example 13
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java    From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetLowFootprintStaxAllMemory() throws IOException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "excel2013test.xlsx";
	String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
	Path file = new Path(fileNameSpreadSheet);
	FileInputFormat.setInputPaths(job, file);
	// set locale to the one of the test data
	job.set("hadoopoffice.read.locale.bcp47", "de");

	// low footprint
	job.set("hadoopoffice.read.lowFootprint", "true");
	// stax parser
	job.set("hadoopoffice.read.lowFootprint.parser", "stax");
	// all memory
	job.set("hadoopoffice.read.lowFootprint.stax.sst.cache", "-1");
	ExcelFileInputFormat format = new ExcelFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);
	assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
	RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNotNull(reader, "Format returned  null RecordReader");
	Text spreadSheetKey = new Text();
	ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
	assertEquals("[excel2013test.xlsx]Sheet1!A1", spreadSheetKey.toString(),
			"Input Split for Excel file has keyname == \"[excel2013test.xlsx]Sheet1!A1\"");
	assertEquals(4, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 4 columns");
	assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
	assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
			"Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
	assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
			"Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
	assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
	assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
	assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[3]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 4 == \"test4\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
	assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 2 with 1 column");
	assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 2 with cell 1 == \"4\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 3");
	assertEquals(5, spreadSheetValue.get().length, "Input Split for Excel file contains row 3 with 5 columns");
	assertEquals("31/12/99", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 3 with cell 1 == \"31/12/99\"");
	assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 3 with cell 2 == \"5\"");
	assertNull(spreadSheetValue.get()[2], "Input Split for Excel file contains row 3 with cell 3 == null");
	assertNull(spreadSheetValue.get()[3], "Input Split for Excel file contains row 3 with cell 4 == null");
	assertEquals("null", ((SpreadSheetCellDAO) spreadSheetValue.get()[4]).getFormattedValue(),
			"Input Split for Excel file contains row 3 with cell 5 == \"null\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 4");
	assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 4 with 1 column");
	assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 4 with cell 1 == \"1\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 5");
	assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 5 with 3 columns");
	assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 5 with cell 1 == \"2\"");
	assertEquals("6", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 5 with cell 2== \"6\"");
	assertEquals("10", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 5 with cell 3== \"10\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 6");
	assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 6 with 3 columns");
	assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 6 with cell 1 == \"3\"");
	assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 6 with cell 2== \"4\"");
	assertEquals("15", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 6 with cell 3== \"15\"");
}
 
Example 14
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java    From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2003SingleSheetLowFootprint() throws IOException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "excel2003test.xls";
	String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
	Path file = new Path(fileNameSpreadSheet);
	FileInputFormat.setInputPaths(job, file);
	// set locale to the one of the test data
	job.set("hadoopoffice.read.locale.bcp47", "de");
	// low footprint
	job.set("hadoopoffice.read.lowFootprint", "true");
	ExcelFileInputFormat format = new ExcelFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);
	assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
	RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNotNull(reader, "Format returned  null RecordReader");
	Text spreadSheetKey = new Text();
	ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
	assertEquals("[excel2003test.xls]Sheet1!A1", spreadSheetKey.toString(),
			"Input Split for Excel file has keyname == \"[excel2003test.xls]Sheet1!A1\"");
	assertEquals(4, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 4 columns");
	assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
	assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
			"Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
	assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
			"Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
	assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
	assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
	assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[3]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 4 == \"test4\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
	assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 2 with 1 column");
	assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 2 with cell 1 == \"4\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 3");
	assertEquals(5, spreadSheetValue.get().length, "Input Split for Excel file contains row 3 with 5 columns");
	assertEquals("31/12/99", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 3 with cell 1 == \"31/12/99\"");
	assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 3 with cell 2 == \"5\"");
	assertNull(spreadSheetValue.get()[2], "Input Split for Excel file contains row 3 with cell 3 == null");
	assertNull(spreadSheetValue.get()[3], "Input Split for Excel file contains row 3 with cell 4 == null");
	assertEquals("null", ((SpreadSheetCellDAO) spreadSheetValue.get()[4]).getFormattedValue(),
			"Input Split for Excel file contains row 3 with cell 5 == \"null\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 4");
	assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 4 with 1 column");
	assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 4 with cell 1 == \"1\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 5");
	assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 5 with 3 columns");
	assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 5 with cell 1 == \"2\"");
	assertEquals("6", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 5 with cell 2== \"6\"");
	assertEquals("10", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 5 with cell 3== \"10\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 6");
	assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 6 with 3 columns");
	assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 6 with cell 1 == \"3\"");
	assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 6 with cell 2== \"4\"");
	assertEquals("15", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 6 with cell 3== \"15\"");
}
 
Example 15
Source File: TestKeyFieldBasedComparator.java    From RDFS with Apache License 2.0
public void configure(String keySpec, int expect) throws Exception {
  Path testdir = new Path("build/test/test.mapred.spill");
  Path inDir = new Path(testdir, "in");
  Path outDir = new Path(testdir, "out");
  FileSystem fs = getFileSystem();
  fs.delete(testdir, true);
  conf.setInputFormat(TextInputFormat.class);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(LongWritable.class);

  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(2);

  conf.setOutputFormat(TextOutputFormat.class);
  conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
  conf.setKeyFieldComparatorOptions(keySpec);
  conf.setKeyFieldPartitionerOptions("-k1.1,1.1");
  conf.set("map.output.key.field.separator", " ");
  conf.setMapperClass(InverseMapper.class);
  conf.setReducerClass(IdentityReducer.class);
  if (!fs.mkdirs(testdir)) {
    throw new IOException("Mkdirs failed to create " + testdir.toString());
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  // set up input data in 2 files 
  Path inFile = new Path(inDir, "part0");
  FileOutputStream fos = new FileOutputStream(inFile.toString());
  fos.write((line1 + "\n").getBytes());
  fos.write((line2 + "\n").getBytes());
  fos.close();
  JobClient jc = new JobClient(conf);
  RunningJob r_job = jc.submitJob(conf);
  while (!r_job.isComplete()) {
    Thread.sleep(1000);
  }
  
  if (!r_job.isSuccessful()) {
    fail("Oops! The job broke due to an unexpected error");
  }
  Path[] outputFiles = FileUtil.stat2Paths(
      getFileSystem().listStatus(outDir,
      new Utils.OutputFileUtils.OutputFilesFilter()));
  if (outputFiles.length > 0) {
    InputStream is = getFileSystem().open(outputFiles[0]);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    String line = reader.readLine();
    //make sure we get what we expect as the first line, and also
    //that we have two lines (both lines must end up in the same
    //reducer since the partitioner takes the same key spec for all
    //lines)
    if (expect == 1) {
      assertTrue(line.startsWith(line1));
    } else if (expect == 2) {
      assertTrue(line.startsWith(line2));
    }
    line = reader.readLine();
    if (expect == 1) {
      assertTrue(line.startsWith(line2));
    } else if (expect == 2) {
      assertTrue(line.startsWith(line1));
    }
    reader.close();
  }
}
 
Example 16
Source File: DataJoinJob.java    From big-c with Apache License 2.0
public static JobConf createDataJoinJob(String args[]) throws IOException {

    String inputDir = args[0];
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
      System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
      System.out.println("Using TextInputFormat: " + args[2]);
      inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
      System.out.println("Using SequenceFileOutputFormat: " + args[7]);
      outputFormat = SequenceFileOutputFormat.class;
      outputValueClass = getClassByName(args[7]);
    } else {
      System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
      maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
      jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, DataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir), true);
    FileInputFormat.setInputPaths(job, inputDir);

    job.setInputFormat(inputFormat);

    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job,
            SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
  }
 
Example 17
Source File: HoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
private void processPaths(JobConf job, CombineFileInputFormatShim combine, List<CombineFileSplit> iss, Path... path)
    throws IOException {
  JobConf currJob = new JobConf(job);
  FileInputFormat.setInputPaths(currJob, path);
  iss.addAll(Arrays.asList(combine.getSplits(currJob, 1)));
}
 
Example 18
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java    From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetLowFootprintStaxPartlyInMemory() throws IOException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "excel2013test.xlsx";
	String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
	Path file = new Path(fileNameSpreadSheet);
	FileInputFormat.setInputPaths(job, file);
	// set locale to the one of the test data
	job.set("hadoopoffice.read.locale.bcp47", "de");

	// low footprint
	job.set("hadoopoffice.read.lowFootprint", "true");
	// stax parser
	job.set("hadoopoffice.read.lowFootprint.parser", "stax");
	// partly in memory
	job.set("hadoopoffice.read.lowFootprint.stax.sst.cache", "2");
	ExcelFileInputFormat format = new ExcelFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);
	assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
	RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNotNull(reader, "Format returned  null RecordReader");
	Text spreadSheetKey = new Text();
	ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
	assertEquals("[excel2013test.xlsx]Sheet1!A1", spreadSheetKey.toString(),
			"Input Split for Excel file has keyname == \"[excel2013test.xlsx]Sheet1!A1\"");
	assertEquals(4, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 4 columns");
	assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
	assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
			"Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
	assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
			"Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
	assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
	assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
	assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[3]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 4 == \"test4\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
	assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 2 with 1 column");
	assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 2 with cell 1 == \"4\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 3");
	assertEquals(5, spreadSheetValue.get().length, "Input Split for Excel file contains row 3 with 5 columns");
	assertEquals("31/12/99", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 3 with cell 1 == \"31/12/99\"");
	assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 3 with cell 2 == \"5\"");
	assertNull(spreadSheetValue.get()[2], "Input Split for Excel file contains row 3 with cell 3 == null");
	assertNull(spreadSheetValue.get()[3], "Input Split for Excel file contains row 3 with cell 4 == null");
	assertEquals("null", ((SpreadSheetCellDAO) spreadSheetValue.get()[4]).getFormattedValue(),
			"Input Split for Excel file contains row 3 with cell 5 == \"null\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 4");
	assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 4 with 1 column");
	assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 4 with cell 1 == \"1\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 5");
	assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 5 with 3 columns");
	assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 5 with cell 1 == \"2\"");
	assertEquals("6", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 5 with cell 2== \"6\"");
	assertEquals("10", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 5 with cell 3== \"10\"");
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 6");
	assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 6 with 3 columns");
	assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 6 with cell 1 == \"3\"");
	assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 6 with cell 2== \"4\"");
	assertEquals("15", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 6 with cell 3== \"15\"");
}
 
Example 19
Source File: IndexUpdater.java    From hadoop-gpu with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard
  // when a reduce task fails, a new reduce task
  // has to know where to re-start
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we halve "io.sort.mb" because we use the other half of the memory to
  // build an intermediate form/index in the Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_"
      + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + 
           FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = "
      + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}