org.apache.hadoop.mapred.FileInputFormat Java Examples

The following examples show how to use org.apache.hadoop.mapred.FileInputFormat, the base class for file-based input formats in Hadoop's old mapred API. Each example is taken from an open-source project; the source file, project, and license are listed above the code.
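
Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them follow: register one or more input paths on a JobConf with the static FileInputFormat.setInputPaths/addInputPath helpers, pick a FileInputFormat subclass as the job's InputFormat, and submit the job. The class name, paths, and the identity mapper/reducer below are illustrative placeholders, not code from any of the projects listed on this page.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class FileInputFormatSketch {

  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(FileInputFormatSketch.class);
    job.setJobName("file-input-format-sketch");

    // setInputPaths replaces any previously configured paths; addInputPath appends one more.
    FileInputFormat.setInputPaths(job, new Path("/data/in"));
    FileInputFormat.addInputPath(job, new Path("/data/more-in"));

    // TextInputFormat is one of the FileInputFormat subclasses that will turn these paths into splits.
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(IdentityReducer.class);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path("/data/out"));

    JobClient.runJob(job);
  }
}

The examples below show the same calls in context: benchmarks, unit tests, and production readers that configure input paths before asking the format for splits or record readers.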
Example #1
Source File: TestFileSystem.java    From hadoop-gpu with Apache License 2.0
public static void readTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);


  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(ReadMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #2
Source File: HiveAccumuloInputFormatTest.java    From accumulo-hive-storage-manager with Apache License 2.0
@Test
public void getNone() {
    FileInputFormat.addInputPath(conf, new Path("unused"));
    conf.set(AccumuloSerde.COLUMN_MAPPINGS, "cf|f1");
    try {
        InputSplit[] splits = inputformat.getSplits(conf, 0);
        assertEquals(splits.length, 1);
        RecordReader<Text,AccumuloHiveRow> reader = inputformat.getRecordReader(splits[0], conf, null);
        Text rowId = new Text("r1");
        AccumuloHiveRow row = new AccumuloHiveRow();
        row.setRowId("r1");
        assertFalse(reader.next(rowId, row));
    } catch (IOException e) {
        log.error(e);
        fail();
    }
    //for whatever reason, this errors unless I manually reset. mockKeyValues() ignored @BeforeTest
    conf.set(AccumuloSerde.COLUMN_MAPPINGS, "cf|name,cf|sid,cf|dgrs,cf|mills");
}
 
Example #3
Source File: BaseAllocator.java    From HiveKa with Apache License 2.0
@Override
public InputSplit[] allocateWork(List<CamusRequest> requests,
    JobConf conf) throws IOException {
  int numTasks = conf.getInt("mapred.map.tasks", 30);
  
  reverseSortRequests(requests);

  List<InputSplit> kafkaETLSplits = new ArrayList<InputSplit>();
  Path[] tablePaths = FileInputFormat.getInputPaths(conf);

  for (int i = 0; i < numTasks; i++) {
    if (requests.size() > 0) {
      kafkaETLSplits.add(new KafkaSplit(tablePaths[0]));
    }
  }

  for (CamusRequest r : requests) {
    getSmallestMultiSplit(kafkaETLSplits).addRequest(r);
  }

  InputSplit[] inputSplits = new InputSplit[kafkaETLSplits.size()];

  return kafkaETLSplits.toArray(inputSplits);
}
 
Example #4
Source File: NNBench.java    From RDFS with Apache License 2.0
/**
 * Run the test
 * 
 * @throws IOException on error
 */
public static void runTests(Configuration config) throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);
  
  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  
  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);
  
  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
 
Example #5
Source File: FrameReaderTextCell.java    From systemds with Apache License 2.0
protected void readTextCellFrameFromHDFS( Path path, JobConf job, FileSystem fs, FrameBlock dest, 
		ValueType[] schema, String[] names, long rlen, long clen)
	throws IOException
{
	if( fs.isDirectory(path) ) {
		FileInputFormat.addInputPath(job, path);
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		InputSplit[] splits = informat.getSplits(job, 1);
		for(InputSplit split: splits)
			readTextCellFrameFromInputSplit(split, informat, job, dest);
	}
	else {
		readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
	}
}
 
Example #6
Source File: Job.java    From RDFS with Apache License 2.0
/**
 * Submit this job to mapred. The state becomes RUNNING if submission 
 * is successful, FAILED otherwise.  
 */
protected synchronized void submit() {
  try {
    if (theJobConf.getBoolean("create.empty.dir.if.nonexist", false)) {
      FileSystem fs = FileSystem.get(theJobConf);
      Path inputPaths[] = FileInputFormat.getInputPaths(theJobConf);
      for (int i = 0; i < inputPaths.length; i++) {
        if (!fs.exists(inputPaths[i])) {
          try {
            fs.mkdirs(inputPaths[i]);
          } catch (IOException e) {
            // ignore: creating the missing input directory is best-effort; submission proceeds regardless
          }
        }
      }
    }
    RunningJob running = jc.submitJob(theJobConf);
    this.mapredJobID = running.getID();
    this.state = Job.RUNNING;
  } catch (IOException ioe) {
    this.state = Job.FAILED;
    this.message = StringUtils.stringifyException(ioe);
  }
}
 
Example #7
Source File: HoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
@Override
public RecordReader getRecordReader(JobConf job, CombineFileSplit split, Reporter reporter,
    Class<RecordReader<K, V>> rrClass) throws IOException {
  isRealTime = Boolean.valueOf(job.get("hudi.hive.realtime", "false"));
  if (isRealTime) {
    List<RecordReader> recordReaders = new LinkedList<>();
    ValidationUtils.checkArgument(split instanceof HoodieCombineRealtimeFileSplit, "Only "
        + HoodieCombineRealtimeFileSplit.class.getName() + " allowed, found " + split.getClass().getName());
    for (InputSplit inputSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) {
      if (split.getPaths().length == 0) {
        continue;
      }
      FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(split.getPath(0).toString(), true, job);
      recordReaders.add(inputFormat.getRecordReader(inputSplit, job, reporter));
    }
    return new HoodieCombineRealtimeRecordReader(job, split, recordReaders);
  }
  return new HadoopShimsSecure.CombineFileRecordReader(job, split, reporter, rrClass);
}
 
Example #8
Source File: JobControlTestUtils.java    From hadoop with Apache License 2.0
/**
 * Creates a simple copy job.
 * 
 * @param indirs List of input directories.
 * @param outdir Output directory.
 * @return JobConf initialised for a simple copy job.
 * @throws Exception If an error occurs creating job configuration.
 */
static JobConf createCopyJob(List<Path> indirs, Path outdir) throws Exception {

  Configuration defaults = new Configuration();
  JobConf theJob = new JobConf(defaults, TestJobControl.class);
  theJob.setJobName("DataMoveJob");

  FileInputFormat.setInputPaths(theJob, indirs.toArray(new Path[0]));
  theJob.setMapperClass(DataCopy.class);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  theJob.setReducerClass(DataCopy.class);
  theJob.setNumMapTasks(12);
  theJob.setNumReduceTasks(4);
  return theJob;
}
 
Example #9
Source File: TestFileSystem.java    From RDFS with Apache License 2.0
public static void writeTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(DATA_DIR, true);
  fs.delete(WRITE_DIR, true);
  
  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(WriteMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, WRITE_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #10
Source File: TestFileSystem.java    From RDFS with Apache License 2.0
public static void readTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);


  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(ReadMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #11
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock1() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth1.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);

  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned  null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 1 contains at least one block");
  assertEquals(0, block.getEthereumTransactions().size(), "Block 1 must have 0 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 1");
  reader.close();
}
 
Example #12
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock1346406() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth1346406.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);

  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned  null RecordReader");

  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 1346406 contains at least one block");
  assertEquals(6, block.getEthereumTransactions().size(), "Block 1346406 must have 6 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 1346406");
  reader.close();
}
 
Example #13
Source File: LinkDbMerger.java    From nutch-htmlunit with Apache License 2.0
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("LinkDb merge: starting at " + sdf.format(start));

  JobConf job = createMergeJob(getConf(), output, normalize, filter);
  for (int i = 0; i < dbs.length; i++) {
    FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME));      
  }
  JobClient.runJob(job);
  FileSystem fs = FileSystem.get(getConf());
  fs.mkdirs(output);
  fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME));

  long end = System.currentTimeMillis();
  LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
 
Example #14
Source File: PersonVersion.java    From blog with MIT License
private static void runJobPv(String inputDir, String outputDir, String jobName, Class<? extends Mapper> mapClass,
                             Class<? extends Reducer> reduceClass) throws Exception {
    JobConf conf = new JobConf(PersonVersion.class);
    conf.setJobName(jobName);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(mapClass);
    conf.setCombinerClass(reduceClass);
    conf.setReducerClass(reduceClass);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, new Path(outputDir));

    JobClient.runJob(conf);
}
 
Example #15
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java    From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedPositiveLowFootprint() throws IOException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "excel2013encrypt.xlsx";
	String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
	Path file = new Path(fileNameSpreadSheet);
	FileInputFormat.setInputPaths(job, file);
	// set locale to the one of the test data
	job.set("hadoopoffice.read.locale.bcp47", "de");
	// low footprint
	job.set("hadoopoffice.read.lowFootprint", "true");

	job.set("hadoopoffice.read.lowFootprint.parser", "stax");
	// for decryption simply set the password
	job.set("hadoopoffice.read.security.crypt.password", "test");
	ExcelFileInputFormat format = new ExcelFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);
	assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
	RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNotNull(reader, "Format returned  null RecordReader");
	Text spreadSheetKey = new Text();
	ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
	assertEquals("[excel2013encrypt.xlsx]Sheet1!A1", spreadSheetKey.toString(),
			"Input Split for Excel file has keyname == \"[excel2013encrypt.xlsx]Sheet1!A1\"");
	assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
	assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
	assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
			"Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
	assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
			"Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
	assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
	assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}
 
Example #16
Source File: ReaderTextCSVParallel.java    From systemds with Apache License 2.0
private void readCSVMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, 
		MatrixBlock dest, long rlen, long clen, int blen, 
		boolean hasHeader, String delim, boolean fill, double fillValue) 
	throws IOException 
{
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);

	ExecutorService pool = CommonThreadPool.get(_numThreads);

	try 
	{
		// create read tasks for all splits
		ArrayList<CSVReadTask> tasks = new ArrayList<>();
		int splitCount = 0;
		for (InputSplit split : splits) {
			tasks.add( new CSVReadTask(split, _offsets, informat, job, dest, 
				rlen, clen, hasHeader, delim, fill, fillValue, splitCount++) );
		}
		pool.invokeAll(tasks);
		pool.shutdown();

		// check return codes and aggregate nnz
		long lnnz = 0;
		for (CSVReadTask rt : tasks) {
			lnnz += rt.getPartialNnz();
			if (!rt.getReturnCode()) {
				Exception err = rt.getException();
				throw new IOException("Read task for csv input failed: "+ err.toString(), err);
			}
		}
		dest.setNonZeros(lnnz);
	} 
	catch (Exception e) {
		throw new IOException("Threadpool issue, while parallel read.", e);
	}
}
 
Example #17
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock403419() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "block403419.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);

  assertEquals(1, inputSplits.length, "Only one split generated for block 403419");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned  null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 403419 contains at least one block");
  assertEquals(2, block.getEthereumTransactions().size(), "Block 403419 must have 2 transactions");
  EthereumBlockHeader ethereumBlockHeader = block.getEthereumBlockHeader();
  assertEquals(
      "f8b483dba2c3b7176a3da549ad41a48bb3121069",
      bytesToHex(ethereumBlockHeader.getCoinBase()).toLowerCase(),
      "Block 403419 was mined by f8b483dba2c3b7176a3da549ad41a48bb3121069");
  assertEquals(
      "08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322",
      bytesToHex(ethereumBlockHeader.getParentHash()).toLowerCase(),
      "The parent of block 403419 has hash 08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322");
  assertFalse(reader.next(key, block), "No further lock 403419  in genesis Block");

  reader.close();
}
 
Example #18
Source File: TestCopyOnWriteActionExecutor.java    From hudi with Apache License 2.0
private FileStatus[] getIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull)
        throws Exception {
  // initialize parquet input format
  HoodieParquetInputFormat hoodieInputFormat = new HoodieParquetInputFormat();
  JobConf jobConf = new JobConf(hadoopConf);
  hoodieInputFormat.setConf(jobConf);
  HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE);
  setupIncremental(jobConf, startCommitTime, numCommitsToPull);
  FileInputFormat.setInputPaths(jobConf, Paths.get(basePath, partitionPath).toString());
  return hoodieInputFormat.listStatus(jobConf);
}
 
Example #19
Source File: MapUtils.java    From tez with Apache License 2.0
private static InputSplit 
createInputSplit(FileSystem fs, Path workDir, JobConf job, Path file, int numKVs)
    throws IOException {
  FileInputFormat.setInputPaths(job, workDir);

  LOG.info("Generating data at path: " + file);
  // create a file with length entries
  @SuppressWarnings("deprecation")
  SequenceFile.Writer writer = 
      SequenceFile.createWriter(fs, job, file, 
          LongWritable.class, Text.class);
  try {
    Random r = new Random(System.currentTimeMillis());
    LongWritable key = new LongWritable();
    Text value = new Text();
    for (int i = numKVs; i > 0; i--) {
      key.set(r.nextInt(1000));
      value.set(Integer.toString(i));
      writer.append(key, value);
      LOG.info("<k, v> : <" + key.get() + ", " + value + ">");
    }
  } finally {
    writer.close();
  }
  
  SequenceFileInputFormat<LongWritable, Text> format = 
      new SequenceFileInputFormat<LongWritable, Text>();
  InputSplit[] splits = format.getSplits(job, 1);
  System.err.println("#split = " + splits.length + " ; " +
      "#locs = " + splits[0].getLocations().length + "; " +
      "loc = " + splits[0].getLocations()[0] + "; " + 
      "off = " + splits[0].getLength() + "; " +
      "file = " + ((FileSplit)splits[0]).getPath());
  return splits[0];
}
 
Example #20
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java    From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2003SingleSheetEncryptedPositiveLowFootprint() throws IOException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "excel2003encrypt.xls";
	String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
	Path file = new Path(fileNameSpreadSheet);
	FileInputFormat.setInputPaths(job, file);
	// set locale to the one of the test data
	job.set("hadoopoffice.read.locale.bcp47", "de");

	// low footprint
	job.set("hadoopoffice.read.lowFootprint", "true");
	// for decryption simply set the password
	job.set("hadoopoffice.read.security.crypt.password", "test");
	ExcelFileInputFormat format = new ExcelFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);
	assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
	RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNotNull(reader, "Format returned  null RecordReader");
	Text spreadSheetKey = new Text();
	ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
	assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
	assertEquals("[excel2003encrypt.xls]Sheet1!A1", spreadSheetKey.toString(),
			"Input Split for Excel file has keyname == \"[excel2003encrypt.xls]Sheet1!A1\"");
	assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
	assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
	assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
			"Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
	assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
			"Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
	assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
	assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
			"Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}
 
Example #21
Source File: TestHoodieParquetInputFormat.java    From hudi with Apache License 2.0
@Test
public void testInputFormatLoad() throws IOException {
  // initial commit
  File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
  InputFormatTestUtil.commit(basePath, "100");

  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10);
  assertEquals(10, inputSplits.length);

  FileStatus[] files = inputFormat.listStatus(jobConf);
  assertEquals(10, files.length);
}
 
Example #22
Source File: TestHoodieParquetInputFormat.java    From hudi with Apache License 2.0
@Test
public void testIncrementalSimple() throws IOException {
  // initial commit
  File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
  createCommitFile(basePath, "100", "2016/05/01");

  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  InputFormatTestUtil.setupIncremental(jobConf, "100", 1);

  FileStatus[] files = inputFormat.listStatus(jobConf);
  assertEquals(0, files.length,
      "We should exclude commit 100 when returning incremental pull with start commit time as 100");
}
 
Example #23
Source File: LinkRank.java    From anthelion with Apache License 2.0
/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 * 
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 * 
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output)
  throws IOException {

  // configure the initializer
  JobConf initializer = new NutchJob(getConf());
  initializer.setJobName("LinkAnalysis Initializer");
  FileInputFormat.addInputPath(initializer, nodeDb);
  FileOutputFormat.setOutputPath(initializer, output);
  initializer.setInputFormat(SequenceFileInputFormat.class);
  initializer.setMapperClass(Initializer.class);
  initializer.setMapOutputKeyClass(Text.class);
  initializer.setMapOutputValueClass(Node.class);
  initializer.setOutputKeyClass(Text.class);
  initializer.setOutputValueClass(Node.class);
  initializer.setOutputFormat(MapFileOutputFormat.class);
  initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the initializer
  LOG.info("Starting initialization job");
  try {
    JobClient.runJob(initializer);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished initialization job.");
}
 
Example #24
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock0to10() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth0to10.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);

  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned  null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  int count = 0;
  while (count < 11) {
    if (reader.next(key, block)) {
      count++;
    }
  }
  assertEquals(11, count, "Block 0..10 contains 11 blocks");

  assertFalse(reader.next(key, block), "No further blocks in block 0..10");
  reader.close();
}
 
Example #25
Source File: FrameReaderTextCSV.java    From systemds with Apache License 2.0
@Override
public final FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names,
		long rlen, long clen)
	throws IOException, DMLRuntimeException 
{
	//prepare file access
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());	
	Path path = new Path( fname );
	FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
	FileInputFormat.addInputPath(job, path);
	
	//check existence and non-empty file
	checkValidInputFile(fs, path); 
	
	//compute size if necessary
	if( rlen <= 0 || clen <= 0 ) {
		Pair<Integer,Integer> size = computeCSVSize(path, job, fs);
		rlen = size.getKey();
		clen = size.getValue();
	}
	
	//allocate output frame block
	ValueType[] lschema = createOutputSchema(schema, clen);
	String[] lnames = createOutputNames(names, clen);
	FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen);

	//core read (sequential/parallel) 
	readCSVFrameFromHDFS(path, job, fs, ret, lschema, lnames, rlen, clen);
	
	return ret;
}
 
Example #26
Source File: TestMRAppWithCombiner.java    From big-c with Apache License 2.0
@Test
public void testCombinerShouldUpdateTheReporter() throws Exception {
  JobConf conf = new JobConf(mrCluster.getConfig());
  int numMaps = 5;
  int numReds = 2;
  Path in = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-in");
  Path out = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-out");
  createInputOutPutFolder(in, out, numMaps);
  conf.setJobName("test-job-with-combiner");
  conf.setMapperClass(IdentityMapper.class);
  conf.setCombinerClass(MyCombinerToCheckReporter.class);
  //conf.setJarByClass(MyCombinerToCheckReporter.class);
  conf.setReducerClass(IdentityReducer.class);
  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, in);
  FileOutputFormat.setOutputPath(conf, out);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);
  
  runJob(conf);
}
 
Example #27
Source File: TestInputOutputFormat.java    From hive-dwrf with Apache License 2.0
@Test
public void testEmptyFile() throws Exception {
  JobConf job = new JobConf(conf);
  Properties properties = new Properties();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
          properties, Reporter.NULL);
  writer.close(true);
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  SerDe serde = new OrcSerde();
  serde.initialize(conf, properties);
  InputFormat<?,?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(1, splits.length);

  // read the whole file
  conf.set("hive.io.file.readcolumn.ids", "0,1");
  org.apache.hadoop.mapred.RecordReader reader =
      in.getRecordReader(splits[0], conf, Reporter.NULL);
  Object key = reader.createKey();
  Object value = reader.createValue();
  assertEquals(0.0, reader.getProgress(), 0.00001);
  assertEquals(0, reader.getPos());
  assertEquals(false, reader.next(key, value));
  reader.close();
  assertEquals(null, serde.getSerDeStats());
}
 
Example #28
Source File: LinkRank.java    From nutch-htmlunit with Apache License 2.0
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 * 
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 * 
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 * 
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
  throws IOException {

  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
 
Example #29
Source File: EmoInputFormat.java    From emodb with Apache License 2.0
@Override
public InputSplit[] getSplits(JobConf job, int numSplits)
        throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(job);

    return FluentIterable.from(BaseInputFormat.getSplits(job, paths))
            .transform(_fromSplit)
            .toArray(InputSplit.class);
}
 
Example #30
Source File: AvroUtils.java    From ml-ease with Apache License 2.0
/**
 * Obtain the Avro input schema from the input data.
 * @param conf the job configuration holding the input paths
 * @return the schema read from the first input path
 * @throws IOException if the schema cannot be read
 */
public static Schema getAvroInputSchema(JobConf conf) throws IOException
{
  Path[] paths = FileInputFormat.getInputPaths(conf);
  if (paths == null)
  {
    throw new IllegalStateException("input paths do not exist in jobConf!");
  }
  Schema inputSchema = AvroUtils.getSchemaFromFile(conf, paths[0]);
  if (inputSchema == null)
  {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  return inputSchema;
}