org.apache.hadoop.mapred.SequenceFileInputFormat Java Examples

The following examples show how to use org.apache.hadoop.mapred.SequenceFileInputFormat. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: NNBench.java    From hadoop with Apache License 2.0 8 votes vote down vote up
/**
 * Run the test
 * 
 * @throws IOException on error
 */
public static void runTests() throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);
  
  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  
  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);
  
  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
 
Example #2
Source File: SolrClean.java    From anthelion with Apache License 2.0 7 votes vote down vote up
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrClean: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(SolrDeleter.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
 
Example #3
Source File: NNBench.java    From hadoop-gpu with Apache License 2.0 6 votes vote down vote up
/**
 * Run the test
 * 
 * @throws IOException on error
 */
public static void runTests() throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);
  
  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  
  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);
  
  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
 
Example #4
Source File: TestDatamerge.java    From big-c with Apache License 2.0 6 votes vote down vote up
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerBase> c) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  JobConf job = new JobConf(conf, c);
  Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  job.set("mapreduce.join.expr", CompositeInputFormat.compose(jointype,
      SequenceFileInputFormat.class, src));
  job.setInt("testdatamerge.sources", srcs);
  job.setInputFormat(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(c);
  job.setReducerClass(c);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  JobClient.runJob(job);
  base.getFileSystem(job).delete(base, true);
}
 
Example #5
Source File: TestDFSIO.java    From big-c with Apache License 2.0 6 votes vote down vote up
private void runIOTest(
        Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass, 
        Path outputDir) throws IOException {
  JobConf job = new JobConf(config, TestDFSIO.class);

  FileInputFormat.setInputPaths(job, getControlDir(config));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #6
Source File: NNBench.java    From big-c with Apache License 2.0 6 votes vote down vote up
/**
 * Run the test
 * 
 * @throws IOException on error
 */
public static void runTests() throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);
  
  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  
  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);
  
  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
 
Example #7
Source File: DirectoryIngestMapperTest.java    From hadoop-solr with Apache License 2.0 6 votes vote down vote up
@Before
public void setUp() throws Exception {
    conf = getDefaultDirectoryIngestMapperConfiguration();
    Path dir = new Path(fs.getWorkingDirectory(), "build");
    Path sub = new Path(dir, "DIMT");
    Path tempDir = new Path(sub, "tmp-dir");
    Path seqDir = new Path(sub, "seq-dir");// this is the location where the
    // fixture will write inputs.seq
    fs.mkdirs(tempDir);
    tempFiles = setupDir(fs, tempDir);
    conf.set(TEMP_DIR, seqDir.toString());
    jobConf = new JobConf(conf);
    jobConf.setMapperClass(DirectoryIngestMapper.class);
    jobConf.setInputFormat(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(jobConf, OUTPUT_DIRECTORY_PATH);
    org.apache.hadoop.mapred.FileInputFormat.setInputPaths(jobConf, new Path(tempDir, "*"));
    Path[] paths = org.apache.hadoop.mapred.FileInputFormat.getInputPaths(jobConf);
    assertEquals(1, paths.length);
}
 
Example #8
Source File: TestDatamerge.java    From hadoop with Apache License 2.0 6 votes vote down vote up
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerBase> c) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  JobConf job = new JobConf(conf, c);
  Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  job.set("mapreduce.join.expr", CompositeInputFormat.compose(jointype,
      SequenceFileInputFormat.class, src));
  job.setInt("testdatamerge.sources", srcs);
  job.setInputFormat(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(c);
  job.setReducerClass(c);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  JobClient.runJob(job);
  base.getFileSystem(job).delete(base, true);
}
 
Example #9
Source File: SequenceFileIngestMapperTest.java    From hadoop-solr with Apache License 2.0 6 votes vote down vote up
@Test
public void test() throws Exception {
    prepareFrankensteinSeqFileInput();
    Configuration conf = getDefaultSequenceFileIngestMapperConfiguration();
    Job job = createJobBasedOnConfiguration(conf, SequenceFileIngestMapper.class);
    ((JobConf)job.getConfiguration()).setInputFormat(SequenceFileInputFormat.class);

    List<String> results = runJobSuccessfully(job, 776);

    assertNumDocsProcessed(job, 776);
    assertEquals(776, results.size());
    for (String docStr : results) {
        assertNotNull(docStr);
    }

}
 
Example #10
Source File: LinkDbMerger.java    From anthelion with Apache License 2.0 6 votes vote down vote up
public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
  Path newLinkDb =
    new Path("linkdb-merge-" + 
             Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("linkdb merge " + linkDb);

  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(LinkDbFilter.class);
  job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
  job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
  job.setReducerClass(LinkDbMerger.class);

  FileOutputFormat.setOutputPath(job, newLinkDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setBoolean("mapred.output.compress", true);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Inlinks.class);

  // https://issues.apache.org/jira/browse/NUTCH-1069
  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  return job;
}
 
Example #11
Source File: TestFileSystem.java    From RDFS with Apache License 2.0 6 votes vote down vote up
public static void writeTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(DATA_DIR, true);
  fs.delete(WRITE_DIR, true);
  
  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(WriteMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, WRITE_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #12
Source File: TestFileSystem.java    From RDFS with Apache License 2.0 6 votes vote down vote up
public static void readTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);


  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(ReadMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #13
Source File: TestDatamerge.java    From RDFS with Apache License 2.0 6 votes vote down vote up
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerBase> c) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  JobConf job = new JobConf(conf, c);
  Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  job.set("mapred.join.expr", CompositeInputFormat.compose(jointype,
      SequenceFileInputFormat.class, src));
  job.setInt("testdatamerge.sources", srcs);
  job.setInputFormat(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(c);
  job.setReducerClass(c);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  JobClient.runJob(job);
  base.getFileSystem(job).delete(base, true);
}
 
Example #14
Source File: NNBench.java    From RDFS with Apache License 2.0 6 votes vote down vote up
/**
 * Run the test
 * 
 * @throws IOException on error
 */
public static void runTests(Configuration config) throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);
  
  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  
  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);
  
  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
 
Example #15
Source File: DFSGeneralTest.java    From RDFS with Apache License 2.0 6 votes vote down vote up
private void updateJobConf(JobConf conf, Path inputPath, Path outputPath) {
  // set specific job config
  conf.setLong(NUMBER_OF_MAPS_KEY, nmaps);
  conf.setLong(NUMBER_OF_THREADS_KEY, nthreads);
  conf.setInt(BUFFER_SIZE_KEY, buffersize);
  conf.setLong(WRITER_DATARATE_KEY, datarate);
  conf.setLong("mapred.task.timeout", Long.MAX_VALUE);
  conf.set(OUTPUT_DIR_KEY, output);
  
  // set the output and input for the map reduce
  FileInputFormat.setInputPaths(conf, inputPath);
  FileOutputFormat.setOutputPath(conf, outputPath);

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  conf.setNumReduceTasks(1);
  conf.setSpeculativeExecution(false);
}
 
Example #16
Source File: LinkDbMerger.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
  Path newLinkDb =
    new Path("linkdb-merge-" + 
             Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("linkdb merge " + linkDb);

  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(LinkDbFilter.class);
  job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
  job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
  job.setReducerClass(LinkDbMerger.class);

  FileOutputFormat.setOutputPath(job, newLinkDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setBoolean("mapred.output.compress", true);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Inlinks.class);

  // https://issues.apache.org/jira/browse/NUTCH-1069
  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  return job;
}
 
Example #17
Source File: TestMultiMRInput.java    From tez with Apache License 2.0 6 votes vote down vote up
@Test(timeout = 5000)
public void test0PhysicalInputs() throws Exception {

  Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
  JobConf jobConf = new JobConf(defaultConf);
  jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(jobConf, workDir);

  InputContext inputContext = createTezInputContext(jobConf, new Configuration(false));

  MultiMRInput mMrInput = new MultiMRInput(inputContext, 0);

  mMrInput.initialize();

  mMrInput.start();

  assertEquals(0, mMrInput.getKeyValueReaders().size());

  List<Event> events = new LinkedList<>();
  try {
    mMrInput.handleEvents(events);
    fail("HandleEvents should cause an input with 0 physical inputs to fail");
  } catch (Exception e) {
    assertTrue(e instanceof IllegalStateException);
  }
}
 
Example #18
Source File: TestFileSystem.java    From RDFS with Apache License 2.0 6 votes vote down vote up
public static void seekTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job,CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(SeekMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #19
Source File: TestDFSIO.java    From hadoop with Apache License 2.0 6 votes vote down vote up
private void runIOTest(
        Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass, 
        Path outputDir) throws IOException {
  JobConf job = new JobConf(config, TestDFSIO.class);

  FileInputFormat.setInputPaths(job, getControlDir(config));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #20
Source File: TestDatamerge.java    From hadoop-gpu with Apache License 2.0 6 votes vote down vote up
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerBase> c) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  JobConf job = new JobConf(conf, c);
  Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  job.set("mapred.join.expr", CompositeInputFormat.compose(jointype,
      SequenceFileInputFormat.class, src));
  job.setInt("testdatamerge.sources", srcs);
  job.setInputFormat(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(c);
  job.setReducerClass(c);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  JobClient.runJob(job);
  base.getFileSystem(job).delete(base, true);
}
 
Example #21
Source File: TestFileSystem.java    From hadoop-gpu with Apache License 2.0 6 votes vote down vote up
public static void seekTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job,CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(SeekMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #22
Source File: TestFileSystem.java    From hadoop-gpu with Apache License 2.0 6 votes vote down vote up
public static void writeTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(DATA_DIR, true);
  fs.delete(WRITE_DIR, true);
  
  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(WriteMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, WRITE_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #23
Source File: TestFileSystem.java    From hadoop-gpu with Apache License 2.0 6 votes vote down vote up
public static void readTest(FileSystem fs, boolean fastCheck)
  throws Exception {

  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);


  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(ReadMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
 
Example #24
Source File: LinkRank.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 * 
 * The inverter job takes a link loops database if it exists. It is an
 * optional componenet of link analysis due to its extreme computational and
 * space requirements but it can be very useful is weeding out and eliminating
 * link farms and other spam pages.
 * 
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 * 
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
  throws IOException {

  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
 
Example #25
Source File: CleaningJob.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
public void delete(String crawldb, boolean noCommit) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("CleaningJob: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());

    FileInputFormat.addInputPath(job, new Path(crawldb,
            CrawlDb.CURRENT_NAME));
    job.setBoolean("noCommit", noCommit);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(DBFilter.class);
    job.setReducerClass(DeleterReducer.class);
    
    job.setJobName("CleaningJob");

    // need to expicitely allow deletions
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}
 
Example #26
Source File: TestMultiMRInput.java    From tez with Apache License 2.0 5 votes vote down vote up
@Test(timeout = 5000)
public void testSingleSplit() throws Exception {

  Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
  JobConf jobConf = new JobConf(defaultConf);
  jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(jobConf, workDir);

  InputContext inputContext = createTezInputContext(jobConf, new Configuration(false));

  MultiMRInput input = new MultiMRInput(inputContext, 1);
  input.initialize();

  AtomicLong inputLength = new AtomicLong();
  LinkedHashMap<LongWritable, Text> data = createSplits(1, workDir, jobConf, inputLength);

  SequenceFileInputFormat<LongWritable, Text> format =
      new SequenceFileInputFormat<LongWritable, Text>();
  InputSplit[] splits = format.getSplits(jobConf, 1);
  assertEquals(1, splits.length);

  MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
  InputDataInformationEvent event =
      InputDataInformationEvent.createWithSerializedPayload(0,
          splitProto.toByteString().asReadOnlyByteBuffer());

  List<Event> eventList = new ArrayList<Event>();
  eventList.add(event);
  input.handleEvents(eventList);

  assertReaders(input, data, 1, inputLength.get());
}
 
Example #27
Source File: TestMultiMRInput.java    From tez with Apache License 2.0 5 votes vote down vote up
@Test(timeout = 5000)
public void testMultipleSplits() throws Exception {

  Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
  JobConf jobConf = new JobConf(defaultConf);
  jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(jobConf, workDir);

  InputContext inputContext = createTezInputContext(jobConf, new Configuration(false));

  MultiMRInput input = new MultiMRInput(inputContext, 2);
  input.initialize();

  AtomicLong inputLength = new AtomicLong();
  LinkedHashMap<LongWritable, Text> data = createSplits(2, workDir, jobConf, inputLength);

  SequenceFileInputFormat<LongWritable, Text> format =
      new SequenceFileInputFormat<LongWritable, Text>();
  InputSplit[] splits = format.getSplits(jobConf, 2);
  assertEquals(2, splits.length);

  MRSplitProto splitProto1 = MRInputHelpers.createSplitProto(splits[0]);
  InputDataInformationEvent event1 =
      InputDataInformationEvent.createWithSerializedPayload(0,
          splitProto1.toByteString().asReadOnlyByteBuffer());

  MRSplitProto splitProto2 = MRInputHelpers.createSplitProto(splits[1]);
  InputDataInformationEvent event2 =
      InputDataInformationEvent.createWithSerializedPayload(0,
          splitProto2.toByteString().asReadOnlyByteBuffer());

  List<Event> eventList = new ArrayList<Event>();
  eventList.add(event1);
  eventList.add(event2);
  input.handleEvents(eventList);

  assertReaders(input, data, 2, inputLength.get());
}
 
Example #28
Source File: TestStreamJob.java    From big-c with Apache License 2.0 5 votes vote down vote up
@Test
public void testCreateJob() throws IOException {
  JobConf job;
  ArrayList<String> dummyArgs = new ArrayList<String>();
  dummyArgs.add("-input"); dummyArgs.add("dummy");
  dummyArgs.add("-output"); dummyArgs.add("dummy");
  dummyArgs.add("-mapper"); dummyArgs.add("dummy");
  dummyArgs.add("-reducer"); dummyArgs.add("dummy");
  ArrayList<String> args;
  
  args = new ArrayList<String>(dummyArgs);
  args.add("-inputformat");
  args.add("org.apache.hadoop.mapred.KeyValueTextInputFormat");
  job = StreamJob.createJob(args.toArray(new String[] {}));
  assertEquals(KeyValueTextInputFormat.class, job.getInputFormat().getClass());
  
  args = new ArrayList<String>(dummyArgs);
  args.add("-inputformat");
  args.add("org.apache.hadoop.mapred.SequenceFileInputFormat");
  job = StreamJob.createJob(args.toArray(new String[] {}));
  assertEquals(SequenceFileInputFormat.class, job.getInputFormat().getClass());
  
  args = new ArrayList<String>(dummyArgs);
  args.add("-inputformat");
  args.add("org.apache.hadoop.mapred.KeyValueTextInputFormat");
  args.add("-inputreader");
  args.add("StreamXmlRecordReader,begin=<doc>,end=</doc>");
  job = StreamJob.createJob(args.toArray(new String[] {}));
  assertEquals(StreamInputFormat.class, job.getInputFormat().getClass());
}
 
Example #29
Source File: DirectoryIngestMapper.java    From hadoop-solr with Apache License 2.0 5 votes vote down vote up
@Override
public void init(JobConf conf) throws IOException {
  // Expand the input path glob into a sequence file of inputs
  Path actualInput = new Path(conf.get(TEMP_DIR), "inputs.seq");
  expandGlob(conf, actualInput, FileInputFormat.getInputPaths(conf));

  // Configure the real M/R job
  conf.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(conf, actualInput);
  conf.setMapperClass(DirectoryIngestMapper.class);
}
 
Example #30
Source File: CrawlDbReader.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status, Integer retry) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb dump: starting");
    LOG.info("CrawlDb db: " + crawlDb);
  }

  Path outFolder = new Path(output);

  JobConf job = new NutchJob(config);
  job.setJobName("dump " + crawlDb);

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, outFolder);

  if (format.equals("csv")) {
    job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
  }
  else if (format.equals("crawldb")) {
    job.setOutputFormat(MapFileOutputFormat.class);
  } else {
    job.setOutputFormat(TextOutputFormat.class);
  }

  if (status != null) job.set("status", status);
  if (regex != null) job.set("regex", regex);
  if (retry != null) job.setInt("retry", retry);
  
  job.setMapperClass(CrawlDbDumpMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);

  JobClient.runJob(job);
  if (LOG.isInfoEnabled()) { LOG.info("CrawlDb dump: done"); }
}