org.apache.nutch.util.NutchConfiguration Java Examples

The following examples show how to use org.apache.nutch.util.NutchConfiguration. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestExtParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Exercises the external-command parser plugin ("parse-ext") by alternating
 * between the example 'cat' and 'md5sum' content types, ten rounds each.
 * The test is skipped on any OS other than Linux because the external
 * commands invoked by the plugin are platform-specific.
 */
public void testIt() throws ParseException {
  // now test only on linux platform
  final String osName = System.getProperty("os.name");
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is "+osName+".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }

  Configuration conf = NutchConfiguration.create();
  // loop alternately, total 10*2 times of invoking external command
  for (int round = 0; round < 10; round++) {
    // parser wired to 'cat': the extracted text must equal the input text
    content.setContentType("application/vnd.nutch.example.cat");
    parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertEquals(expectedText, parse.getText());

    // parser wired to 'md5sum': the extracted text starts with the digest
    content.setContentType("application/vnd.nutch.example.md5sum");
    parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertTrue(parse.getText().startsWith(expectedMD5sum));
  }
}
 
Example #2
Source File: TestDomainBlacklistURLFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Drives the domain-blacklist filter against a fixed set of URLs loaded
 * from the sample hosts.txt file.  filter() returning null means the URL
 * was filtered out; non-null means it passed through.
 */
public void testFilter()
  throws Exception {

  String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
  DomainBlacklistURLFilter domainBlacklistFilter =
      new DomainBlacklistURLFilter(domainBlacklistFile);
  domainBlacklistFilter.setConf(NutchConfiguration.create());

  // URLs expected to be filtered out (filter() returns null)
  String[] filteredOut = {
      "http://lucene.apache.org",
      "http://hadoop.apache.org",
      "http://www.apache.org",
      "http://www.foobar.net",
      "http://www.foobas.net",
      "http://www.yahoo.com",
      "http://www.foobar.be"
  };
  for (String url : filteredOut) {
    assertNull(domainBlacklistFilter.filter(url));
  }

  // URLs expected to pass through (filter() returns non-null)
  String[] passedThrough = {
      "http://www.google.com",
      "http://mail.yahoo.com",
      "http://www.adobe.com"
  };
  for (String url : passedThrough) {
    assertNotNull(domainBlacklistFilter.filter(url));
  }
}
 
Example #3
Source File: LinkReader.java    From nutchpy with Apache License 2.0 6 votes vote down vote up
/**
 * Counts the number of key/value records in the sequence file at the
 * given path.
 *
 * @param path filesystem path of the sequence file to scan
 * @return the number of records in the file
 * @throws IOException if the file cannot be opened or read
 */
public static long count(String path) throws IOException  {
    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);

    Path file = new Path(path);
    System.out.println(file);

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    try {
        // instantiate the declared key/value types so next() can fill them
        Writable key = (Writable)
                ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Writable value = (Writable)
                ReflectionUtils.newInstance(reader.getValueClass(), conf);

        long count = 0;
        while (reader.next(key, value)) {
            count += 1;
        }
        return count;
    } finally {
        // the original leaked the reader (and its underlying stream) on
        // every call; always release it
        reader.close();
    }
}
 
Example #4
Source File: TestMetatagParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches the sample file via the protocol layer, runs it through the
 * parse chain, and checks that the metatag parse filter copied the
 * expected description and keywords into the parse metadata.
 */
public void testIt() {
  Configuration conf = NutchConfiguration.create();
  String urlString = "file:" + sampleDir + fileSeparator + sampleFile;

  try {
    // fetch the raw bytes for the sample URL
    Content content = new ProtocolFactory(conf)
        .getProtocol(urlString)
        .getProtocolOutput(new Text(urlString), new CrawlDatum())
        .getContent();

    // parse and pull out the metadata produced by the filter chain
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    Metadata parseMeta = parse.getData().getParseMeta();

    // meta tags are recorded under "metatag."-prefixed keys
    assertEquals(description, parseMeta.get("metatag.description"));
    assertEquals(keywords, parseMeta.get("metatag.keywords"));
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.toString());
  }
}
 
Example #5
Source File: WdcParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Command-line driver: parses the local HTML file named by args[0] with
 * WdcParser and prints the resulting parse data, text, and the
 * META_CONTAINS_SEM metadata value.
 *
 * @param args args[0] is the path of a local HTML file to parse
 * @throws Exception if reading or parsing fails
 */
public static void main(String[] args) throws Exception {
	// LOG.setLevel(Level.FINE);
	String name = args[0];
	String url = "file:" + name;
	File file = new File(name);
	byte[] bytes = new byte[(int) file.length()];
	// try-with-resources: the original never closed this stream
	try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
		in.readFully(bytes);
	}
	Configuration conf = NutchConfiguration.create();
	WdcParser parser = new WdcParser();
	parser.setConf(conf);
	Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
	System.out.println("data: " + parse.getData());

	System.out.println("text: " + parse.getText());

	String contains = parse.getData().getMeta(META_CONTAINS_SEM);
	System.out.println("contains: " + contains);

}
 
Example #6
Source File: TestSubcollection.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Test filtering logic: whitelisted URLs pass through unchanged, while
 * blacklisted and unmatched URLs are dropped (filter() returns null).
 *
 * @throws Exception
 */
public void testFilter() throws Exception {
  Subcollection sc = new Subcollection(NutchConfiguration.create());
  sc.setWhiteList("www.nutch.org\nwww.apache.org");
  sc.setBlackList("jpg\nwww.apache.org/zecret/");

  // a whitelisted URL is returned unchanged
  String whitelisted = "http://www.apache.org/index.html";
  assertEquals(whitelisted, sc.filter(whitelisted));

  // blacklist entries override the whitelist
  assertNull(sc.filter("http://www.apache.org/zecret/index.html"));
  assertNull(sc.filter("http://www.apache.org/img/image.jpg"));

  // a URL matching neither list is dropped as well
  assertNull(sc.filter("http://www.google.com/"));
}
 
Example #7
Source File: TestExtParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Prepares a temp file whose content is {@code expectedText} and fetches
 * it through the protocol layer into {@code content}.
 * <p>
 * The temp file is created under the directory named by the "test.data"
 * system property (defined in ./src/plugin/build-plugin.xml) when set,
 * otherwise under java.io.tmpdir.
 */
protected void setUp() throws ProtocolException, IOException {
  String path = System.getProperty("test.data");
  if (path != null) {
    File tempDir = new File(path);
    if (!tempDir.exists())
      tempDir.mkdir();
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt",tempDir);
  } else {
    // otherwise in java.io.tmpdir
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt");
  }
  // toURI().toURL() instead of the deprecated File.toURL(), which does
  // not escape illegal URL characters
  urlString = tempFile.toURI().toURL().toString();

  // write the expected text; close the stream even if the write throws
  // (the original leaked it on failure)
  FileOutputStream fos = new FileOutputStream(tempFile);
  try {
    fos.write(expectedText.getBytes());
  } finally {
    fos.close();
  }

  // get nutch content
  Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
  content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
  protocol = null;
}
 
Example #8
Source File: TestRegexURLNormalizer.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Loads every regex-normalize-*.xml config from the sample directory,
 * registers it with the normalizer under its scope name, and reads the
 * matching test-data file into {@code testData}.
 *
 * @param name the test case name, passed through to the superclass
 * @throws IOException if a test-data file cannot be read
 */
public TestRegexURLNormalizer(String name) throws IOException {
  super(name);
  normalizer = new RegexURLNormalizer();
  conf = NutchConfiguration.create();
  normalizer.setConf(conf);
  File[] configs = new File(sampleDir).listFiles(new FileFilter() {
    public boolean accept(File f) {
      return f.getName().endsWith(".xml")
          && f.getName().startsWith("regex-normalize-");
    }
  });
  if (configs == null) {
    // listFiles() returns null when sampleDir is missing or unreadable;
    // the original threw a bare NPE here
    LOG.warn("No config files found in '" + sampleDir + "'");
    return;
  }
  for (int i = 0; i < configs.length; i++) {
    try {
      FileReader reader = new FileReader(configs[i]);
      try {
        String cname = configs[i].getName();
        // strip the "regex-normalize-" prefix (16 chars) and ".xml" suffix
        cname = cname.substring(16, cname.indexOf(".xml"));
        normalizer.setConfiguration(reader, cname);
        NormalizedURL[] urls = readTestFile(cname);
        testData.put(cname, urls);
      } finally {
        // the original leaked one FileReader per config file
        reader.close();
      }
    } catch (Exception e) {
      LOG.warn("Could load config from '" + configs[i] + "': " + e.toString());
    }
  }
}
 
Example #9
Source File: TestHTMLLanguageParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Test parsing of language identifiers from html: each test document must
 * produce its expected language value in the parse metadata.
 **/
public void testMetaHTMLParsing() {
  try {
    ParseUtil parser = new ParseUtil(NutchConfiguration.create());
    // validate every test document against its expected language
    for (int idx = 0; idx < docs.length; idx++) {
      Content content = getContent(docs[idx]);
      Parse parse = parser.parse(content).get(content.getUrl());
      String lang = (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE);
      assertEquals(metalanguages[idx], lang);
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }
}
 
Example #10
Source File: TestPdfParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Parses each sample PDF with the tika parser and checks that the
 * extracted text contains {@code expectedText}.
 */
public void testIt() throws ProtocolException, ParseException {
  // one configuration/factory/parse utility serves all samples; the
  // original rebuilt them on every loop iteration
  Configuration conf = NutchConfiguration.create();
  ProtocolFactory factory = new ProtocolFactory(conf);
  ParseUtil parseUtil = new ParseUtil(conf);

  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Protocol protocol = factory.getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = parseUtil.parseByExtensionId("parse-tika", content).get(content.getUrl());

    // indexOf() returns 0 for a match at the very start of the text, so
    // test >= 0; the original's "> 0" would wrongly fail that case
    int index = parse.getText().indexOf(expectedText);
    assertTrue(index >= 0);
  }
}
 
Example #11
Source File: TestHTMLLanguageParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Test parsing of language identifiers from html 
 **/
public void testMetaHTMLParsing() {
  try {
    ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());
    /* every sample document should report its expected language */
    for (int t = 0; t < docs.length; t++) {
      Content content = getContent(docs[t]);
      Object lang = parseUtil.parse(content).get(content.getUrl())
          .getData().getParseMeta().get(Metadata.LANGUAGE);
      assertEquals(metalanguages[t], (String) lang);
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }
}
 
Example #12
Source File: TestCCParseFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the given HTML file and asserts that the Creative Commons parse
 * filter extracted the expected license metadata.
 *
 * @param file     local HTML file to parse
 * @param url      URL to associate with the content
 * @param license  expected "License-Url" metadata value
 * @param location expected "License-Location" metadata value
 * @param type     expected "Work-Type" metadata value
 * @throws Exception if reading or parsing fails
 */
public void pageTest(File file, String url,
                     String license, String location, String type)
  throws Exception {

  String contentType = "text/html";
  byte[] bytes;
  // slurp the file; try-with-resources closes the stream even on error
  // (the original leaked it if read() threw)
  try (InputStream in = new FileInputStream(file)) {
    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
    byte[] buffer = new byte[1024];
    int n;
    while ((n = in.read(buffer)) != -1) {
      out.write(buffer, 0, n);
    }
    bytes = out.toByteArray();
  }

  Configuration conf = NutchConfiguration.create();
  Content content =
    new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}
 
Example #13
Source File: SuffixURLFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Command-line driver: reads URLs from stdin, one per line, and reports
 * whether the suffix filter accepts or rejects each.
 *
 * @param args optional path of a suffix file; when absent the filter is
 *             configured from the default Nutch configuration
 * @throws IOException if reading the suffix file or stdin fails
 */
public static void main(String args[]) throws IOException {

    SuffixURLFilter filter;
    if (args.length >= 1)
      filter = new SuffixURLFilter(new FileReader(args[0]));
    else {
      filter = new SuffixURLFilter();
      filter.setConf(NutchConfiguration.create());
    }

    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    String line;
    while ((line = in.readLine()) != null) {
      String out = filter.filter(line);
      if (out != null) {
        System.out.println("ACCEPTED " + out);
      } else {
        // report the rejected input itself; "out" is always null in this
        // branch, so the original printed "REJECTED null" for every URL
        System.out.println("REJECTED " + line);
      }
    }
  }
 
Example #14
Source File: IndexManager.java    From spacewalk with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Initializes the per-language documentation segments used for search
 * summaries.
 * <p>
 * NOTE:  NutchConfiguration is expecting "nutch-default.xml" and "nutch-site.xml"
 * to be available in the CLASSPATH
 *
 * @return always true; failures are logged and tolerated (missing Nutch
 *         simply leaves doc summaries empty)
 */
private boolean initDocSummary() {
    try {
        nutchConf = NutchConfiguration.create();
        nutchAnalyzerFactory = new AnalyzerFactory(nutchConf);
        FileSystem fs = FileSystem.get(nutchConf);
        docSegments = new TreeMap<String, FetchedSegments>
                                                            (String.CASE_INSENSITIVE_ORDER);
        for (String key : docLocaleLookUp.keySet()) {
            String segmentsDir = indexWorkDir + File.separator +
                getDocIndexPath(key) + File.separator + "segments";
            // note: `new` can never return null, so the original's
            // null-check on segments was dead code and has been removed
            FetchedSegments segments = new FetchedSegments(fs, segmentsDir, nutchConf);
            String[] segNames = segments.getSegmentNames();
            if (segNames == null || segNames.length == 0) {
                log.info("Unable to find any segments for language: " + key);
                // record the language as unavailable and move on; the
                // original fell through and overwrote this null below
                docSegments.put(key, null);
                continue;
            }
            log.info("Adding Documentation segments for language: " + key);
            docSegments.put(key, segments);
        }
    }
    catch (Exception e) {
        log.error("ignoring exception - most likely Nutch isn't present, so" +
        " doc summaries will be empty");
        e.printStackTrace();
    }
    return true;
}
 
Example #15
Source File: NodeReader.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the NodeReader tool.  The command line arguments must contain a 
 * webgraphdb path and a url.  The url must match the normalized url that is
 * contained in the NodeDb of the WebGraph.
 */
public static void main(String[] args)
  throws Exception {

  Options options = new Options();
  Option helpOpts = OptionBuilder.withArgName("help").withDescription(
    "show this help message").create("help");
  Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
    .withDescription("the webgraphdb to use").create("webgraphdb");
  Option urlOpts = OptionBuilder.withArgName("url").hasOptionalArg()
    .withDescription("the url to dump").create("url");
  options.addOption(helpOpts);
  options.addOption(webGraphOpts);
  options.addOption(urlOpts);

  CommandLineParser parser = new GnuParser();
  try {

    // command line must take a webgraphdb and a url
    CommandLine line = parser.parse(options, args);
    if (line.hasOption("help") || !line.hasOption("webgraphdb")
      || !line.hasOption("url")) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("WebGraphReader", options);
      return;
    }

    // dump the values to system out and return
    String webGraphDb = line.getOptionValue("webgraphdb");
    String url = line.getOptionValue("url");
    NodeReader reader = new NodeReader(NutchConfiguration.create());
    reader.dumpUrl(new Path(webGraphDb), url);
    
    return;
  }
  catch (Exception e) {
    e.printStackTrace();
    return;
  }
}
 
Example #16
Source File: TestIndexingFilters.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Test behaviour when NutchDOcument is null
 */

public void testNutchDocumentNullIndexingFilter() throws IndexingException{
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  IndexingFilters filters = new IndexingFilters(conf);
  NutchDocument doc = filters.filter(null, new ParseImpl("text", new ParseData(
    new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
    "http://www.example.com/"), new CrawlDatum(), new Inlinks());
   
  assertNull(doc);
}
 
Example #17
Source File: LoopReader.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the LoopReader tool.  For this tool to work the loops job must have
 * already been run on the corresponding WebGraph.
 */
public static void main(String[] args)
  throws Exception {

  Option help = OptionBuilder.withArgName("help").withDescription(
    "show this help message").create("help");
  Option webgraph = OptionBuilder.withArgName("webgraphdb").hasArg()
    .withDescription("the webgraphdb to use").create("webgraphdb");
  Option url = OptionBuilder.withArgName("url").hasOptionalArg()
    .withDescription("the url to dump").create("url");

  Options options = new Options();
  options.addOption(help);
  options.addOption(webgraph);
  options.addOption(url);

  CommandLineParser cliParser = new GnuParser();
  try {
    CommandLine cli = cliParser.parse(options, args);

    // both -webgraphdb and -url are required
    if (cli.hasOption("help") || !cli.hasOption("webgraphdb")
      || !cli.hasOption("url")) {
      new HelpFormatter().printHelp("WebGraphReader", options);
      return;
    }

    String webGraphDb = cli.getOptionValue("webgraphdb");
    String dumpUrl = cli.getOptionValue("url");
    new LoopReader(NutchConfiguration.create())
      .dumpUrl(new Path(webGraphDb), dumpUrl);
  }
  catch (Exception e) {
    e.printStackTrace();
  }
}
 
Example #18
Source File: ParseData.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Command-line driver: reads a single ParseData record by record number
 * from a segment's parse-data directory and prints it.
 *
 * @param argv generic Hadoop options followed by a record number and a
 *             segment path (see {@code usage})
 * @throws Exception if reading the segment fails
 */
public static void main(String argv[]) throws Exception {
  String usage = "ParseData (-local | -dfs <namenode:port>) recno segment";
  
  if (argv.length < 3) {
    System.out.println("usage:" + usage);
    return;
  }

  Options opts = new Options();
  Configuration conf = NutchConfiguration.create();
  
  GenericOptionsParser parser =
    new GenericOptionsParser(conf, opts, argv);
  
  String[] remainingArgs = parser.getRemainingArgs();
  // guard against generic options consuming the positional arguments;
  // the original indexed remainingArgs blindly and could throw
  // ArrayIndexOutOfBoundsException
  if (remainingArgs.length < 2) {
    System.out.println("usage:" + usage);
    return;
  }
  FileSystem fs = FileSystem.get(conf);
  
  try {
    int recno = Integer.parseInt(remainingArgs[0]);
    String segment = remainingArgs[1];

    Path file = new Path(segment, DIR_NAME);
    System.out.println("Reading from file: " + file);

    ArrayFile.Reader parses = new ArrayFile.Reader(fs, file.toString(), conf);
    try {
      ParseData parseDatum = new ParseData();
      parses.get(recno, parseDatum);

      System.out.println("Retrieved " + recno + " from file " + file);
      System.out.println(parseDatum);
    } finally {
      // the original skipped close() whenever get() threw
      parses.close();
    }
  } finally {
    fs.close();
  }
}
 
Example #19
Source File: LinkReader.java    From nutchpy with Apache License 2.0 5 votes vote down vote up
/**
 * Reads up to the first {@code nrows} link records from the sequence
 * file at {@code path}.
 *
 * @param nrows maximum number of rows to return
 * @param path  filesystem path of the sequence file
 * @return a list with one map of column name to value per record
 * @throws IOException if the file cannot be opened or read
 */
public static List head(int nrows, String path) throws IOException {
    List<HashMap> rows = new ArrayList<HashMap>();

    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);

    Path file = new Path(path);

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    try {
        Writable key = (Writable)
                ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        LinkDatum value = new LinkDatum();

        int i = 0;
        while (reader.next(key, value)) {
            if (i == nrows) {
                break;
            }
            i += 1;
            try {
                HashMap<String, String> t_row = getLinksRow(key, value);
                rows.add(t_row);
            }
            catch (Exception e) {
                // best-effort: a record that fails to convert is skipped,
                // matching the original's silent per-row error handling
            }
        }
    } finally {
        // the original leaked the reader on every call
        reader.close();
    }

    return rows;
}
 
Example #20
Source File: TestOOParser.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Parses every "ootest*" OpenOffice sample with the tika parser and
 * checks that some non-empty text is extracted.
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  ProtocolFactory factory = new ProtocolFactory(conf);

  System.out.println("Expected : "+expectedText);
  
  for (int i=0; i<sampleFiles.length; i++) {
    // only the OpenOffice samples are relevant for this test; use the
    // idiomatic negation instead of the original "==false" comparison,
    // and skip before building the unused url string
    if (!sampleFiles[i].startsWith("ootest")) continue;

    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Protocol protocol = factory.getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

    // simply test for the presence of a text - the ordering of the elements may differ from what was expected
    // in the previous tests
    assertTrue(text!=null && text.length() > 0);
    
    System.out.println("Found "+sampleFiles[i]+": "+text);
  }
}
 
Example #21
Source File: TestParserFactory.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/** Inits the Test Case with the test parse-plugin file */
protected void setUp() throws Exception {
    Configuration testConf = NutchConfiguration.create();
    // enable every plugin and point the factory at the test mapping file
    testConf.set("plugin.includes", ".*");
    testConf.set("parse.plugin.file",
                 "org/apache/nutch/parse/parse-plugin-test.xml");
    conf = testConf;
    parserFactory = new ParserFactory(conf);
}
 
Example #22
Source File: LinkDumper.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Command-line driver: prints the link nodes stored in the LinkDumper
 * output directory for a single url.
 *
 * Usage: LinkDumper$Reader &lt;webgraphdb&gt; &lt;url&gt;
 */
public static void main(String[] args)
  throws Exception {
  
  if (args == null || args.length < 2) {
    System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
    return;
  }

  Path webGraphDb = new Path(args[0]);
  String url = args[1];

  // open the readers for the linkdump directory
  Configuration conf = NutchConfiguration.create();
  FileSystem fs = FileSystem.get(conf);
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs,
    new Path(webGraphDb, DUMP_DIR), conf);

  // look up the link nodes stored under the url key
  LinkNodes nodes = new LinkNodes();
  MapFileOutputFormat.getEntry(readers,
    new HashPartitioner<Text, LinkNodes>(), new Text(url), nodes);

  // print one line per link node
  System.out.println(url + ":");
  for (LinkNode node : nodes.getLinks()) {
    System.out.println("  " + node.getUrl() + " - "
      + node.getNode().toString());
  }

  // close the readers
  FSUtils.closeReaders(readers);
}
 
Example #23
Source File: TestMoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that MoreIndexingFilter derives the document title from a
 * Content-Disposition "filename=" header.
 */
public void testContentDispositionTitle() throws IndexingException {
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(NutchConfiguration.create());

  // simulate a response carrying a Content-Disposition filename
  Metadata metadata = new Metadata();
  metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  ParseData parseData = new ParseData(new ParseStatus(), "title",
    new Outlink[0], metadata);
  NutchDocument doc = filter.filter(new NutchDocument(),
    new ParseImpl("text", parseData), new Text("http://www.example.com/"),
    new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
}
 
Example #24
Source File: IndexManager.java    From uyuni with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Initializes the per-language documentation segments used for search
 * summaries.
 * <p>
 * NOTE:  NutchConfiguration is expecting "nutch-default.xml" and "nutch-site.xml"
 * to be available in the CLASSPATH
 *
 * @return always true; failures are logged and tolerated (missing Nutch
 *         simply leaves doc summaries empty)
 */
private boolean initDocSummary() {
    try {
        nutchConf = NutchConfiguration.create();
        nutchAnalyzerFactory = new AnalyzerFactory(nutchConf);
        FileSystem fs = FileSystem.get(nutchConf);
        docSegments = new TreeMap<String, FetchedSegments>
                                                            (String.CASE_INSENSITIVE_ORDER);
        for (String key : docLocaleLookUp.keySet()) {
            String segmentsDir = indexWorkDir + File.separator +
                getDocIndexPath(key) + File.separator + "segments";
            // note: `new` can never return null, so the original's
            // null-check on segments was dead code and has been removed
            FetchedSegments segments = new FetchedSegments(fs, segmentsDir, nutchConf);
            String[] segNames = segments.getSegmentNames();
            if (segNames == null || segNames.length == 0) {
                log.info("Unable to find any segments for language: " + key);
                // record the language as unavailable and move on; the
                // original fell through and overwrote this null below
                docSegments.put(key, null);
                continue;
            }
            log.info("Adding Documentation segments for language: " + key);
            docSegments.put(key, segments);
        }
    }
    catch (Exception e) {
        log.error("ignoring exception - most likely Nutch isn't present, so" +
        " doc summaries will be empty");
        e.printStackTrace();
    }
    return true;
}
 
Example #25
Source File: URLNormalizerChecker.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Command-line driver: parses the -normalizer and -scope flags, then runs
 * either one named URL normalizer check or the full chain for the scope.
 * Any unrecognized argument prints the usage text and exits with -1.
 */
public static void main(String[] args) throws Exception {

    String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]"
      + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";

    String normalizerName = null;
    String scope = URLNormalizers.SCOPE_DEFAULT;
    int i = 0;
    while (i < args.length) {
      String arg = args[i];
      if ("-normalizer".equals(arg)) {
        normalizerName = args[++i];
      } else if ("-scope".equals(arg)) {
        scope = args[++i];
      } else {
        System.err.println(usage);
        System.exit(-1);
      }
      i++;
    }

    URLNormalizerChecker checker = new URLNormalizerChecker(NutchConfiguration.create());
    if (normalizerName == null) {
      checker.checkAll(scope);
    } else {
      checker.checkOne(normalizerName, scope);
    }

    System.exit(0);
  }
 
Example #26
Source File: LinkReader.java    From nutchpy with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every link record from the sequence file at {@code path}.
 *
 * @param path filesystem path of the sequence file
 * @return a list with one map of column name to value per record
 * @throws IOException if the file cannot be opened or read
 */
public static List read(String path) throws IOException {
    List<HashMap> rows = new ArrayList<HashMap>();

    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);

    Path file = new Path(path);

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    try {
        Writable key = (Writable)
                ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        LinkDatum value = new LinkDatum();

        while (reader.next(key, value)) {
            try {
                HashMap<String, String> t_row = getLinksRow(key, value);
                rows.add(t_row);
            }
            catch (Exception e) {
                // best-effort: a record that fails to convert is skipped,
                // matching the original's silent per-row error handling
            }
        }
    } finally {
        // the original leaked the reader on every call
        reader.close();
    }

    return rows;
}
 
Example #27
Source File: TestStaticFieldIndexerTest.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the fixture objects shared by the test methods: a default
 * configuration, empty parse/crawl inputs, and the filter under test.
 */
protected void setUp() throws Exception {
  conf = NutchConfiguration.create();
  filter = new StaticFieldIndexer();

  // minimal, empty inputs for the indexing filter
  url = new Text("http://nutch.apache.org/index.html");
  parse = new ParseImpl();
  crawlDatum = new CrawlDatum();
  inlinks = new Inlinks();
}
 
Example #28
Source File: TestLinkDbMerger.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Populates the input and expected-output url maps for the merge test and
 * creates a fresh randomly-named working directory.
 */
public void setUp() throws Exception {
  // inputs for the two linkdbs to be merged
  init1.put(url10, urls10);
  init1.put(url11, urls11);
  init2.put(url20, urls20);
  init2.put(url21, urls21);

  // expected contents of the merged linkdb
  expected.put(url10, urls10_expected);
  expected.put(url11, urls11_expected);
  expected.put(url20, urls20_expected);
  expected.put(url21, urls21_expected);

  conf = NutchConfiguration.create();
  fs = FileSystem.get(conf);
  String dirName = "build/test/test-linkdb-" + new java.util.Random().nextInt();
  testDir = new Path(dirName);
  fs.mkdirs(testDir);
}
 
Example #29
Source File: TestPassURLNormalizer.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * PassURLNormalizer must return every url unchanged, even one containing
 * relative-path segments.
 */
public void testPassURLNormalizer() {
  PassURLNormalizer normalizer = new PassURLNormalizer();
  normalizer.setConf(NutchConfiguration.create());

  final String url = "http://www.example.com/test/..//";
  String result = null;
  try {
    result = normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT);
  } catch (MalformedURLException mue) {
    fail(mue.toString());
  }

  // a pass-through normalizer leaves the url byte-identical
  assertEquals(url, result);
}
 
Example #30
Source File: TestOOParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Parses every "ootest*" OpenOffice sample with the tika parser and
 * checks that some non-empty text is extracted.
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  ProtocolFactory factory = new ProtocolFactory(conf);

  System.out.println("Expected : "+expectedText);
  
  for (int i=0; i<sampleFiles.length; i++) {
    // only the OpenOffice samples are relevant for this test; use the
    // idiomatic negation instead of the original "==false" comparison,
    // and skip before building the unused url string
    if (!sampleFiles[i].startsWith("ootest")) continue;

    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Protocol protocol = factory.getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

    // simply test for the presence of a text - the ordering of the elements may differ from what was expected
    // in the previous tests
    assertTrue(text!=null && text.length() > 0);
    
    System.out.println("Found "+sampleFiles[i]+": "+text);
  }
}