org.apache.hadoop.mapred.SequenceFileInputFormat Java Examples
The following examples show how to use
org.apache.hadoop.mapred.SequenceFileInputFormat.
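Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: point a JobConf at SequenceFile input via setInputFormat(SequenceFileInputFormat.class), wire up mapper/reducer classes, and submit with JobClient.runJob. The /tmp paths and the Text/Text key-value types are illustrative assumptions; in practice the key and value classes must match whatever types were actually written into the SequenceFiles.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SequenceFileJobSketch {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(SequenceFileJobSketch.class);
    job.setJobName("sequencefile-sketch");

    // Read records from SequenceFiles; the key/value types are stored in the
    // files themselves, so the mapper's input types must match what was written.
    job.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("/tmp/input"));   // hypothetical path
    FileOutputFormat.setOutputPath(job, new Path("/tmp/output")); // hypothetical path

    // Identity map/reduce passes records straight through; here we assume the
    // input SequenceFiles were written with Text keys and Text values.
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(IdentityReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    JobClient.runJob(job);
  }
}

Because IdentityMapper and IdentityReducer simply copy records through, this makes a convenient smoke test for reading a SequenceFile with the old mapred API.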
Example #1
Source File: NNBench.java From hadoop with Apache License 2.0
/**
 * Run the test
 *
 * @throws IOException on error
 */
public static void runTests() throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);

  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);

  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);

  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
Example #2
Source File: SolrClean.java From anthelion with Apache License 2.0
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrClean: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(SolrDeleter.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
Example #3
Source File: NNBench.java From hadoop-gpu with Apache License 2.0
/**
 * Run the test
 *
 * @throws IOException on error
 */
public static void runTests() throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);

  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);

  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);

  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
Example #4
Source File: TestDatamerge.java From big-c with Apache License 2.0
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerBase> c) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  JobConf job = new JobConf(conf, c);
  Path base = cluster.getFileSystem().makeQualified(new Path("/" + jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  job.set("mapreduce.join.expr",
      CompositeInputFormat.compose(jointype, SequenceFileInputFormat.class, src));
  job.setInt("testdatamerge.sources", srcs);
  job.setInputFormat(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));
  job.setMapperClass(c);
  job.setReducerClass(c);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  JobClient.runJob(job);
  base.getFileSystem(job).delete(base, true);
}
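A note on the join-based examples (this one and the TestDatamerge variants below): CompositeInputFormat.compose(jointype, SequenceFileInputFormat.class, src) builds a single join-expression string, roughly of the form jointype(tbl(org.apache.hadoop.mapred.SequenceFileInputFormat,"path1"),tbl(...,"path2"),...), which CompositeInputFormat later parses back from the mapreduce.join.expr property (named mapred.join.expr in older releases, as the RDFS and hadoop-gpu examples show) to drive the map-side join.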
Example #5
Source File: TestDFSIO.java From big-c with Apache License 2.0
private void runIOTest(
    Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass,
    Path outputDir) throws IOException {
  JobConf job = new JobConf(config, TestDFSIO.class);

  FileInputFormat.setInputPaths(job, getControlDir(config));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #6
Source File: NNBench.java From big-c with Apache License 2.0
/**
 * Run the test
 *
 * @throws IOException on error
 */
public static void runTests() throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);

  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);

  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);

  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
Example #7
Source File: DirectoryIngestMapperTest.java From hadoop-solr with Apache License 2.0
@Before
public void setUp() throws Exception {
  conf = getDefaultDirectoryIngestMapperConfiguration();
  Path dir = new Path(fs.getWorkingDirectory(), "build");
  Path sub = new Path(dir, "DIMT");
  Path tempDir = new Path(sub, "tmp-dir");
  // this is the location where the fixture will write inputs.seq
  Path seqDir = new Path(sub, "seq-dir");
  fs.mkdirs(tempDir);
  tempFiles = setupDir(fs, tempDir);
  conf.set(TEMP_DIR, seqDir.toString());
  jobConf = new JobConf(conf);
  jobConf.setMapperClass(DirectoryIngestMapper.class);
  jobConf.setInputFormat(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(jobConf, OUTPUT_DIRECTORY_PATH);
  org.apache.hadoop.mapred.FileInputFormat.setInputPaths(jobConf, new Path(tempDir, "*"));
  Path[] paths = org.apache.hadoop.mapred.FileInputFormat.getInputPaths(jobConf);
  assertEquals(1, paths.length);
}
Example #8
Source File: TestDatamerge.java From hadoop with Apache License 2.0
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerBase> c) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  JobConf job = new JobConf(conf, c);
  Path base = cluster.getFileSystem().makeQualified(new Path("/" + jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  job.set("mapreduce.join.expr",
      CompositeInputFormat.compose(jointype, SequenceFileInputFormat.class, src));
  job.setInt("testdatamerge.sources", srcs);
  job.setInputFormat(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));
  job.setMapperClass(c);
  job.setReducerClass(c);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  JobClient.runJob(job);
  base.getFileSystem(job).delete(base, true);
}
Example #9
Source File: SequenceFileIngestMapperTest.java From hadoop-solr with Apache License 2.0
@Test
public void test() throws Exception {
  prepareFrankensteinSeqFileInput();
  Configuration conf = getDefaultSequenceFileIngestMapperConfiguration();
  Job job = createJobBasedOnConfiguration(conf, SequenceFileIngestMapper.class);
  ((JobConf) job.getConfiguration()).setInputFormat(SequenceFileInputFormat.class);

  List<String> results = runJobSuccessfully(job, 776);

  assertNumDocsProcessed(job, 776);
  assertEquals(776, results.size());
  for (String docStr : results) {
    assertNotNull(docStr);
  }
}
Example #10
Source File: LinkDbMerger.java From anthelion with Apache License 2.0
public static JobConf createMergeJob(Configuration config, Path linkDb,
    boolean normalize, boolean filter) {
  Path newLinkDb = new Path("linkdb-merge-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("linkdb merge " + linkDb);

  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(LinkDbFilter.class);
  job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
  job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
  job.setReducerClass(LinkDbMerger.class);

  FileOutputFormat.setOutputPath(job, newLinkDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setBoolean("mapred.output.compress", true);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Inlinks.class);

  // https://issues.apache.org/jira/browse/NUTCH-1069
  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  return job;
}
Example #11
Source File: TestFileSystem.java From RDFS with Apache License 2.0
public static void writeTest(FileSystem fs, boolean fastCheck)
    throws Exception {
  fs.delete(DATA_DIR, true);
  fs.delete(WRITE_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(WriteMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, WRITE_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #12
Source File: TestFileSystem.java From RDFS with Apache License 2.0
public static void readTest(FileSystem fs, boolean fastCheck)
    throws Exception {
  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(ReadMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #13
Source File: TestDatamerge.java From RDFS with Apache License 2.0
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerBase> c) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  JobConf job = new JobConf(conf, c);
  Path base = cluster.getFileSystem().makeQualified(new Path("/" + jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  job.set("mapred.join.expr",
      CompositeInputFormat.compose(jointype, SequenceFileInputFormat.class, src));
  job.setInt("testdatamerge.sources", srcs);
  job.setInputFormat(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));
  job.setMapperClass(c);
  job.setReducerClass(c);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  JobClient.runJob(job);
  base.getFileSystem(job).delete(base, true);
}
Example #14
Source File: NNBench.java From RDFS with Apache License 2.0
/**
 * Run the test
 *
 * @throws IOException on error
 */
public static void runTests(Configuration config) throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);

  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);

  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);

  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
Example #15
Source File: DFSGeneralTest.java From RDFS with Apache License 2.0
private void updateJobConf(JobConf conf, Path inputPath, Path outputPath) {
  // set specific job config
  conf.setLong(NUMBER_OF_MAPS_KEY, nmaps);
  conf.setLong(NUMBER_OF_THREADS_KEY, nthreads);
  conf.setInt(BUFFER_SIZE_KEY, buffersize);
  conf.setLong(WRITER_DATARATE_KEY, datarate);
  conf.setLong("mapred.task.timeout", Long.MAX_VALUE);
  conf.set(OUTPUT_DIR_KEY, output);

  // set the output and input for the map reduce
  FileInputFormat.setInputPaths(conf, inputPath);
  FileOutputFormat.setOutputPath(conf, outputPath);

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  conf.setNumReduceTasks(1);
  conf.setSpeculativeExecution(false);
}
Example #16
Source File: LinkDbMerger.java From nutch-htmlunit with Apache License 2.0
public static JobConf createMergeJob(Configuration config, Path linkDb,
    boolean normalize, boolean filter) {
  Path newLinkDb = new Path("linkdb-merge-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("linkdb merge " + linkDb);

  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(LinkDbFilter.class);
  job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
  job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
  job.setReducerClass(LinkDbMerger.class);

  FileOutputFormat.setOutputPath(job, newLinkDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setBoolean("mapred.output.compress", true);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Inlinks.class);

  // https://issues.apache.org/jira/browse/NUTCH-1069
  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  return job;
}
Example #17
Source File: TestMultiMRInput.java From tez with Apache License 2.0
@Test(timeout = 5000)
public void test0PhysicalInputs() throws Exception {
  Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
  JobConf jobConf = new JobConf(defaultConf);
  jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(jobConf, workDir);

  InputContext inputContext = createTezInputContext(jobConf, new Configuration(false));

  MultiMRInput mMrInput = new MultiMRInput(inputContext, 0);

  mMrInput.initialize();
  mMrInput.start();

  assertEquals(0, mMrInput.getKeyValueReaders().size());

  List<Event> events = new LinkedList<>();
  try {
    mMrInput.handleEvents(events);
    fail("HandleEvents should cause an input with 0 physical inputs to fail");
  } catch (Exception e) {
    assertTrue(e instanceof IllegalStateException);
  }
}
Example #18
Source File: TestFileSystem.java From RDFS with Apache License 2.0
public static void seekTest(FileSystem fs, boolean fastCheck)
    throws Exception {
  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(SeekMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #19
Source File: TestDFSIO.java From hadoop with Apache License 2.0
private void runIOTest(
    Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass,
    Path outputDir) throws IOException {
  JobConf job = new JobConf(config, TestDFSIO.class);

  FileInputFormat.setInputPaths(job, getControlDir(config));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #20
Source File: TestDatamerge.java From hadoop-gpu with Apache License 2.0
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerBase> c) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  JobConf job = new JobConf(conf, c);
  Path base = cluster.getFileSystem().makeQualified(new Path("/" + jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  job.set("mapred.join.expr",
      CompositeInputFormat.compose(jointype, SequenceFileInputFormat.class, src));
  job.setInt("testdatamerge.sources", srcs);
  job.setInputFormat(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));
  job.setMapperClass(c);
  job.setReducerClass(c);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  JobClient.runJob(job);
  base.getFileSystem(job).delete(base, true);
}
Example #21
Source File: TestFileSystem.java From hadoop-gpu with Apache License 2.0
public static void seekTest(FileSystem fs, boolean fastCheck)
    throws Exception {
  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(SeekMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #22
Source File: TestFileSystem.java From hadoop-gpu with Apache License 2.0
public static void writeTest(FileSystem fs, boolean fastCheck)
    throws Exception {
  fs.delete(DATA_DIR, true);
  fs.delete(WRITE_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(WriteMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, WRITE_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #23
Source File: TestFileSystem.java From hadoop-gpu with Apache License 2.0
public static void readTest(FileSystem fs, boolean fastCheck)
    throws Exception {
  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(ReadMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #24
Source File: LinkRank.java From nutch-htmlunit with Apache License 2.0
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 *
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and
 * eliminating link farms and other spam pages.
 *
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 *
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
    throws IOException {

  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
Example #25
Source File: CleaningJob.java From nutch-htmlunit with Apache License 2.0
public void delete(String crawldb, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("CleaningJob: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(DeleterReducer.class);
  job.setJobName("CleaningJob");

  // need to explicitly allow deletions
  job.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
Example #26
Source File: TestMultiMRInput.java From tez with Apache License 2.0
@Test(timeout = 5000)
public void testSingleSplit() throws Exception {
  Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
  JobConf jobConf = new JobConf(defaultConf);
  jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(jobConf, workDir);

  InputContext inputContext = createTezInputContext(jobConf, new Configuration(false));

  MultiMRInput input = new MultiMRInput(inputContext, 1);
  input.initialize();

  AtomicLong inputLength = new AtomicLong();
  LinkedHashMap<LongWritable, Text> data = createSplits(1, workDir, jobConf, inputLength);

  SequenceFileInputFormat<LongWritable, Text> format =
      new SequenceFileInputFormat<LongWritable, Text>();
  InputSplit[] splits = format.getSplits(jobConf, 1);
  assertEquals(1, splits.length);

  MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
  InputDataInformationEvent event = InputDataInformationEvent.createWithSerializedPayload(0,
      splitProto.toByteString().asReadOnlyByteBuffer());

  List<Event> eventList = new ArrayList<Event>();
  eventList.add(event);
  input.handleEvents(eventList);

  assertReaders(input, data, 1, inputLength.get());
}
Example #27
Source File: TestMultiMRInput.java From tez with Apache License 2.0
@Test(timeout = 5000)
public void testMultipleSplits() throws Exception {
  Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
  JobConf jobConf = new JobConf(defaultConf);
  jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(jobConf, workDir);

  InputContext inputContext = createTezInputContext(jobConf, new Configuration(false));

  MultiMRInput input = new MultiMRInput(inputContext, 2);
  input.initialize();

  AtomicLong inputLength = new AtomicLong();
  LinkedHashMap<LongWritable, Text> data = createSplits(2, workDir, jobConf, inputLength);

  SequenceFileInputFormat<LongWritable, Text> format =
      new SequenceFileInputFormat<LongWritable, Text>();
  InputSplit[] splits = format.getSplits(jobConf, 2);
  assertEquals(2, splits.length);

  MRSplitProto splitProto1 = MRInputHelpers.createSplitProto(splits[0]);
  InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0,
      splitProto1.toByteString().asReadOnlyByteBuffer());

  MRSplitProto splitProto2 = MRInputHelpers.createSplitProto(splits[1]);
  InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(0,
      splitProto2.toByteString().asReadOnlyByteBuffer());

  List<Event> eventList = new ArrayList<Event>();
  eventList.add(event1);
  eventList.add(event2);
  input.handleEvents(eventList);

  assertReaders(input, data, 2, inputLength.get());
}
Example #28
Source File: TestStreamJob.java From big-c with Apache License 2.0
@Test
public void testCreateJob() throws IOException {
  JobConf job;
  ArrayList<String> dummyArgs = new ArrayList<String>();
  dummyArgs.add("-input"); dummyArgs.add("dummy");
  dummyArgs.add("-output"); dummyArgs.add("dummy");
  dummyArgs.add("-mapper"); dummyArgs.add("dummy");
  dummyArgs.add("-reducer"); dummyArgs.add("dummy");

  ArrayList<String> args;

  args = new ArrayList<String>(dummyArgs);
  args.add("-inputformat");
  args.add("org.apache.hadoop.mapred.KeyValueTextInputFormat");
  job = StreamJob.createJob(args.toArray(new String[] {}));
  assertEquals(KeyValueTextInputFormat.class, job.getInputFormat().getClass());

  args = new ArrayList<String>(dummyArgs);
  args.add("-inputformat");
  args.add("org.apache.hadoop.mapred.SequenceFileInputFormat");
  job = StreamJob.createJob(args.toArray(new String[] {}));
  assertEquals(SequenceFileInputFormat.class, job.getInputFormat().getClass());

  args = new ArrayList<String>(dummyArgs);
  args.add("-inputformat");
  args.add("org.apache.hadoop.mapred.KeyValueTextInputFormat");
  args.add("-inputreader");
  args.add("StreamXmlRecordReader,begin=<doc>,end=</doc>");
  job = StreamJob.createJob(args.toArray(new String[] {}));
  assertEquals(StreamInputFormat.class, job.getInputFormat().getClass());
}
Example #29
Source File: DirectoryIngestMapper.java From hadoop-solr with Apache License 2.0
@Override
public void init(JobConf conf) throws IOException {
  // Expand the input path glob into a sequence file of inputs
  Path actualInput = new Path(conf.get(TEMP_DIR), "inputs.seq");
  expandGlob(conf, actualInput, FileInputFormat.getInputPaths(conf));

  // Configure the real M/R job
  conf.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(conf, actualInput);
  conf.setMapperClass(DirectoryIngestMapper.class);
}
Example #30
Source File: CrawlDbReader.java From nutch-htmlunit with Apache License 2.0
public void processDumpJob(String crawlDb, String output, Configuration config,
    String format, String regex, String status, Integer retry) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb dump: starting");
    LOG.info("CrawlDb db: " + crawlDb);
  }

  Path outFolder = new Path(output);

  JobConf job = new NutchJob(config);
  job.setJobName("dump " + crawlDb);

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, outFolder);

  if (format.equals("csv")) {
    job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
  } else if (format.equals("crawldb")) {
    job.setOutputFormat(MapFileOutputFormat.class);
  } else {
    job.setOutputFormat(TextOutputFormat.class);
  }

  if (status != null) job.set("status", status);
  if (regex != null) job.set("regex", regex);
  if (retry != null) job.setInt("retry", retry);

  job.setMapperClass(CrawlDbDumpMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  JobClient.runJob(job);

  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb dump: done");
  }
}