org.apache.nutch.parse.Parse Java Examples

The following examples show how to use org.apache.nutch.parse.Parse. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: LanguageIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Adds a "lang" field to the document being indexed.
 *
 * The value is read from the parse metadata (Metadata.LANGUAGE) first; when
 * that is null, the Content-Language entry of the content metadata is used.
 * If neither yields a non-empty value, the field is set to "unknown".
 *
 * @return the same doc instance, with the "lang" field added
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // Preferred source: parse metadata (possibly populated by HTMLLanguageParser).
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Fallback: the language announced by the HTTP header.
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean missing = (language == null) || (language.length() == 0);
  doc.add("lang", missing ? "unknown" : language);

  return doc;
}
 
Example #2
Source File: RelTagParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Scans the HTML document for rel-tags and records every tag found in the
 * parse metadata under the REL_TAG key.
 *
 * @param content     the fetched content; its URL selects the parse to update
 * @param parseResult the parse result whose metadata is updated in place
 * @param metaTags    not used by this method
 * @param doc         the DOM fragment that is scanned for rel-tags
 * @return the same parseResult instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {
  
  // get parse obj
  Parse parse = parseResult.get(content.getUrl());
  // Trying to find the document's rel-tags
  Parser parser = new Parser(doc);
  // Wildcard type instead of a raw Set: keeps the compiler's generic checks
  // active without assuming the element type declared by getRelTags().
  Set<?> tags = parser.getRelTags();
  Metadata metadata = parse.getData().getParseMeta();
  for (Object tag : tags) {
    metadata.add(REL_TAG, (String) tag);
  }
  return parseResult;
}
 
Example #3
Source File: TLDScoringFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Scales the initial indexing score by the boosts found for the values of
 * the document's "tld" field.
 *
 * Each value of the field is looked up in tldEntries; values without an
 * entry are skipped. When the document has no "tld" field the multiplier
 * stays at 1.0.
 *
 * @return initScore multiplied by the accumulated boost
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float multiplier = 1.0f;
  NutchField tldField = doc.getField("tld");

  if (tldField != null) {
    for (Object value : tldField.getValues()) {
      DomainSuffix suffix = tldEntries.get(value.toString());
      if (suffix == null) {
        continue;
      }
      multiplier *= suffix.getBoost();
    }
  }
  return initScore * multiplier;
}
 
Example #4
Source File: RelTagParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Scans the HTML document for rel-tags and adds each one to the parse
 * metadata under the REL_TAG key.
 *
 * @return the parseResult instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {
  
  // Parse entry belonging to this content's URL
  Parse parse = parseResult.get(content.getUrl());
  // Collect the rel-tags present in the document
  Parser parser = new Parser(doc);
  Set<?> relTags = parser.getRelTags();
  Metadata parseMeta = parse.getData().getParseMeta();
  for (Object relTag : relTags) {
    parseMeta.add(REL_TAG, (String) relTag);
  }

  return parseResult;
}
 
Example #5
Source File: URLMetaIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Copies selected metadata entries from the CrawlDatum into the
 * NutchDocument.
 *
 * For every tag name in urlMetaTags (listed, per the original documentation,
 * in the "urlmeta.tags" property) the CrawlDatum metadata is queried; each
 * value found is added to the document under the tag's name.
 *
 * @return doc, unchanged when urlMetaTags or doc is null
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	if (urlMetaTags == null || doc == null) {
		return doc;
	}

	for (String tagName : urlMetaTags) {
		Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
		if (tagValue == null) {
			// the datum carries no value for this tag
			continue;
		}
		doc.add(tagName, tagValue.toString());
	}

	return doc;
}
 
Example #6
Source File: TestImageMetadata.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Parses every sample file with the "parse-tika" parser and checks that the
 * "width" and "height" metadata values are "121" and "48".
 */
public void testIt() throws ProtocolException, ParseException {
  for (int idx = 0; idx < sampleFiles.length; idx++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // a fresh configuration is created for each sample file
    Configuration conf = NutchConfiguration.create();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    assertEquals("121", parse.getData().getMeta("width"));
    assertEquals("48", parse.getData().getMeta("height"));
  }
}
 
Example #7
Source File: LanguageIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Stores the document language in the "lang" field.
 *
 * Lookup order: Metadata.LANGUAGE in the parse metadata, then
 * Response.CONTENT_LANGUAGE in the content metadata. A null or empty result
 * is replaced by "unknown".
 *
 * @return the doc that was passed in, after the "lang" field has been added
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // LANGUAGE may have been put into the parse metadata by HTMLLanguageParser
  String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // otherwise see whether the HTTP header tells us the language
  if (detected == null) {
    detected = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  String value;
  if (detected == null || detected.length() == 0) {
    value = "unknown";
  } else {
    value = detected;
  }
  doc.add("lang", value);

  return doc;
}
 
Example #8
Source File: JSParseFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Passes the document to walk(...) together with an initially empty outlink
 * list and, when that list is non-empty afterwards, replaces the parse entry
 * for this content with one whose outlinks are the new links followed by
 * the previously existing ones.
 *
 * Status, title, content metadata, parse metadata and text are carried over
 * from the existing parse unchanged.
 *
 * @return the parseResult instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  Parse parse = parseResult.get(content.getUrl());

  String baseUrl = content.getBaseUrl();
  ArrayList<Outlink> collected = new ArrayList<Outlink>();
  walk(doc, parse, metaTags, baseUrl, collected);
  if (collected.isEmpty()) {
    // nothing was collected: leave the parse result untouched
    return parseResult;
  }

  // existing outlinks go after the newly collected ones
  List<Outlink> existing = Arrays.asList(parse.getData().getOutlinks());
  collected.addAll(existing);
  Outlink[] merged = collected.toArray(new Outlink[collected.size()]);

  ParseData rebuilt = new ParseData(parse.getData().getStatus(),
                                    parse.getData().getTitle(),
                                    merged,
                                    parse.getData().getContentMeta(),
                                    parse.getData().getParseMeta());

  // replace original parse obj with new one
  parseResult.put(content.getUrl(), new ParseText(parse.getText()), rebuilt);
  return parseResult;
}
 
Example #9
Source File: HTMLLanguageParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Tries to find the document's language from page metadata and headers.
 *
 * Sources are consulted in order and the first non-null answer wins: the
 * parse metadata, a LanguageParser run over the DOM fragment, and finally
 * the Content-Language entry of the content metadata.
 *
 * @return the language found, or null when no source provides one
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
    String fromParseMeta = getLanguageFromMetadata(page.getData().getParseMeta());
    if (fromParseMeta != null) {
        return fromParseMeta;
    }

    String fromDocument = new LanguageParser(doc).getLanguage();
    if (fromDocument != null) {
        return fromDocument;
    }

    return page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}
 
Example #10
Source File: TestZipParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Parses every sample file with the "parse-zip" parser and checks that the
 * extracted text equals expectedText.
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());

    assertTrue(parse.getText().equals(expectedText));
  }
}
 
Example #11
Source File: TestSWFParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Parses every sample file and compares its whitespace-normalised text with
 * the entry of sampleTexts at the same index.
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    // collapse each run of blanks, tabs and line breaks into a single space
    String normalised = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
    assertTrue(sampleTexts[idx].equals(normalised));
  }
}
 
Example #12
Source File: TestPdfParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Parses every sample file with the "parse-tika" parser and checks that the
 * extracted text contains expectedText.
 *
 * @throws ProtocolException declared by the signature; propagated from the
 *           protocol calls
 * @throws ParseException declared by the signature; propagated from the
 *           parse calls
 */
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;

  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Configuration conf = NutchConfiguration.create();
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    int index = parse.getText().indexOf(expectedText);
    // indexOf reports "not found" as -1, so a hit at offset 0 is a valid match
    assertTrue(index >= 0);
  }
}
 
Example #13
Source File: URLMetaIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up each metatag named in urlMetaTags (per the original
 * documentation, the tags listed in the "urlmeta.tags" property) inside the
 * CrawlDatum metadata and, for every one that is present, adds its value to
 * the NutchDocument under the tag's name.
 *
 * @return doc; returned as-is when either urlMetaTags or doc is null
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	boolean nothingToDo = (urlMetaTags == null) || (doc == null);
	if (nothingToDo) {
		return doc;
	}

	for (String tag : urlMetaTags) {
		Text value = (Text) datum.getMetaData().get(new Text(tag));
		if (value != null) {
			doc.add(tag, value.toString());
		}
	}

	return doc;
}
 
Example #14
Source File: TestSWFParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * For each sample file: fetches it through the protocol layer, parses it,
 * normalises whitespace in the resulting text and asserts that it equals
 * the matching entry of sampleTexts.
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();

  for (int pos = 0; pos < sampleFiles.length; pos++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[pos];

    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    // whitespace runs become single spaces before the comparison
    String actual = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
    assertTrue(sampleTexts[pos].equals(actual));
  }
}
 
Example #15
Source File: TestCCParseFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the given file as "text/html" content and checks the license
 * metadata found in the resulting parse.
 *
 * @param file     the file whose bytes are parsed
 * @param url      used both as URL and base URL of the content
 * @param license  expected value of the "License-Url" parse metadata
 * @param location expected value of the "License-Location" parse metadata
 * @param type     expected value of the "Work-Type" parse metadata
 * @throws Exception if reading the file or parsing the content fails
 */
public void pageTest(File file, String url,
                     String license, String location, String type)
  throws Exception {

  String contentType = "text/html";
  ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
  InputStream in = new FileInputStream(file);
  try {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  } finally {
    // release the file handle even when a read fails
    in.close();
  }
  byte[] bytes = out.toByteArray();
  Configuration conf = NutchConfiguration.create();

  Content content =
    new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse =  new ParseUtil(conf).parse(content).get(content.getUrl());
  
  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}
 
Example #16
Source File: TLDScoringFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Computes the indexer score as initScore times the product of the boosts
 * of all "tld" field values that have an entry in tldEntries.
 *
 * A document without a "tld" field, or whose values have no entry, keeps a
 * boost of 1.0.
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float boost = 1.0f;

  NutchField tldField = doc.getField("tld");
  if (tldField != null) {
    for (Object tldValue : tldField.getValues()) {
      DomainSuffix known = tldEntries.get(tldValue.toString());
      if (known != null) {
        boost *= known.getBoost();
      }
    }
  }

  return initScore * boost;
}
 
Example #17
Source File: JSParseFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Passes the document to walk(...) together with an initially empty outlink
 * list and, when that list is non-empty afterwards, replaces the parse entry
 * for this content with one that carries the new links followed by the
 * previously existing ones.
 *
 * @param content     the fetched content; supplies the URL and base URL
 * @param parseResult the parse result to update
 * @param metaTags    handed on to walk(...)
 * @param doc         the DOM fragment handed on to walk(...)
 * @return the parseResult instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  Parse parse = parseResult.get(content.getUrl());

  String url = content.getBaseUrl();
  // Parameterised instead of raw, so the element type is checked by the
  // compiler and toArray needs no unchecked cast.
  ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
  walk(doc, parse, metaTags, url, outlinks);
  if (outlinks.size() > 0) {
    Outlink[] old = parse.getData().getOutlinks();
    String title = parse.getData().getTitle();
    List<Outlink> list = Arrays.asList(old);
    // existing outlinks are appended after the newly collected ones
    outlinks.addAll(list);
    ParseStatus status = parse.getData().getStatus();
    String text = parse.getText();
    Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
    ParseData parseData = new ParseData(status, title, newlinks,
                                        parse.getData().getContentMeta(),
                                        parse.getData().getParseMeta());

    // replace original parse obj with new one
    parseResult.put(content.getUrl(), new ParseText(text), parseData);
  }
  return parseResult;
}
 
Example #18
Source File: HeadingsParseFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Stores the text of each configured heading element in the parse metadata.
 *
 * For every name in headings, getElement(...) is asked for the matching
 * text; a non-null result is trimmed and, if anything remains, set in the
 * parse metadata under the heading's name.
 *
 * @param content     the fetched content; its URL selects the parse to update
 * @param parseResult the parse result whose metadata is updated in place
 * @param metaTags    not used by this method
 * @param doc         the DOM fragment, stored in this.doc before the lookup
 * @return the parseResult instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
  this.doc = doc;

  String heading;
  Parse parse = parseResult.get(content.getUrl());

  for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
    heading = getElement(headings[i]);

    if (heading != null) {
      // Strings are immutable: trim() returns the trimmed copy, which must be
      // kept. Otherwise whitespace-only headings pass the length check below
      // and untrimmed text is stored.
      heading = heading.trim();

      if (heading.length() > 0) {
        parse.getData().getParseMeta().set(headings[i], heading);
      }
    }
  }

  return parseResult;
}
 
Example #19
Source File: FeedParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a command line version of this {@link Parser}.
 *
 * The feed file is read fully into memory, parsed as "application/rss+xml"
 * content, and the key, data and text of every resulting parse are printed
 * to standard output.
 * 
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 * 
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // the stream was previously never closed; release it even on failure
    in.close();
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
 
Example #20
Source File: TestHTMLLanguageParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Tests parsing of language identifiers from HTML: every test document is
 * parsed and the Metadata.LANGUAGE value of its parse metadata is compared
 * with the expected entry of metalanguages. Any exception fails the test.
 **/
public void testMetaHTMLParsing() {

  try {
    ParseUtil parser = new ParseUtil(NutchConfiguration.create());
    /* validate the result for each test document in turn */
    for (int idx = 0; idx < docs.length; idx++) {
      Content content = getContent(docs[idx]);
      Parse parse = parser.parse(content).get(content.getUrl());
      String detected = (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE);
      assertEquals(metalanguages[idx], detected);
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }

}
 
Example #21
Source File: TestMetatagParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches and parses the sample file, then checks that the parse metadata
 * holds the expected "metatag.description" and "metatag.keywords" values.
 * Any exception raised on the way fails the test.
 */
public void testIt() {
  Configuration conf = NutchConfiguration.create();
  String urlString = "file:" + sampleDir + fileSeparator + sampleFile;

  try {
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    // metadata of the parse that belongs to the fetched URL
    Metadata parseMeta =
        new ParseUtil(conf).parse(content).get(content.getUrl()).getData().getParseMeta();

    // check that we get the same values
    assertEquals(description, parseMeta.get("metatag.description"));
    assertEquals(keywords, parseMeta.get("metatag.keywords"));
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.toString());
  }
}
 
Example #22
Source File: HTMLLanguageParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Tries to find the document's language from page headers and metadata.
 *
 * The parse metadata is checked first, then a LanguageParser is run over
 * the DOM fragment, and as a last resort the Content-Language value of the
 * content metadata is returned (which may itself be null).
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
    String language = getLanguageFromMetadata(page.getData().getParseMeta());

    if (language == null) {
        language = new LanguageParser(doc).getLanguage();
    }

    return (language != null)
        ? language
        : page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}
 
Example #23
Source File: FeedParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a command line version of this {@link Parser}.
 *
 * Reads the whole feed file into a byte array, parses it as
 * "application/rss+xml" content and prints the key, data and text of each
 * parse in the result to standard output.
 * 
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 * 
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // close the input stream in every case; it used to be left open
    in.close();
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
 
Example #24
Source File: S2jhIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the "sku" and "price" parse metadata values to the document.
 *
 * @return the document with both fields added, or null when the "sku"
 *         metadata value is blank
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {
    ParseData data = parse.getData();
    String sku = data.getMeta("sku");

    if (!StringUtils.isBlank(sku)) {
        doc.add("sku", sku);
        doc.add("price", data.getMeta("price"));
        return doc;
    }

    // no usable sku: a null document is returned
    return null;
}
 
Example #25
Source File: HTMLLanguageParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Scans the HTML document looking at possible indications of content
 * language:
 * <ol>
 * <li>html lang attribute
 * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)</li>
 * <li>meta dc.language
 * (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)</li>
 * <li>meta http-equiv (content-language)
 * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)</li>
 * </ol>
 *
 * The detect and identify fields select the strategy: when exactly one of
 * them is non-negative only that strategy runs; otherwise the one with the
 * smaller value runs first and the other serves as fallback. When the two
 * values are equal a warning is logged and nothing is extracted. A language
 * that is found is stored in the parse metadata under Metadata.LANGUAGE.
 *
 * @return the parseResult instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
        HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());

    boolean detectOnly = detect >= 0 && identify < 0;
    boolean identifyOnly = detect < 0 && identify >= 0;

    String language;
    if (detectOnly) {
        language = detectLanguage(parse, doc);
    } else if (identifyOnly) {
        language = identifyLanguage(parse);
    } else if (detect < identify) {
        // detection first, statistical identification as fallback
        language = detectLanguage(parse, doc);
        if (language == null) {
            language = identifyLanguage(parse);
        }
    } else if (identify < detect) {
        // statistical identification first, detection as fallback
        language = identifyLanguage(parse);
        if (language == null) {
            language = detectLanguage(parse, doc);
        }
    } else {
        LOG.warn("No configuration for language extraction policy is provided");
        return parseResult;
    }

    if (language != null) {
        parse.getData().getParseMeta().set(Metadata.LANGUAGE, language);
    }

    return parseResult;
}
 
Example #26
Source File: TestOOParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Parses every sample file whose name starts with "ootest" using the
 * "parse-tika" parser and checks that some text was extracted. The expected
 * text and each extracted text are printed to standard output.
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  ProtocolFactory factory = new ProtocolFactory(conf);

  System.out.println("Expected : "+expectedText);

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    if (!sampleFiles[idx].startsWith("ootest")) {
      continue;
    }

    Protocol protocol = factory.getProtocol(urlString);
    Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

    // only the presence of text is tested - the ordering of the elements may
    // differ from what was expected in the previous tests
    boolean hasText = text!=null && text.length() > 0;
    assertTrue(hasText);

    System.out.println("Found "+sampleFiles[idx]+": "+text);
  }
}
 
Example #27
Source File: AnchorIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * The {@link AnchorIndexingFilter} filter object which supports boolean 
 * configuration settings for the deduplication of anchors. 
 * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
 *
 * Each inlink anchor is added to the document's "anchor" field. With
 * deduplication enabled, an anchor is only added when its lower-cased form
 * has not been seen earlier in this call.
 *  
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object passing through the filter 
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
  Inlinks inlinks) throws IndexingException {

  String[] anchors;
  if (inlinks != null) {
    anchors = inlinks.getAnchors();
  } else {
    anchors = new String[0];
  }

  // lower-cased anchors already added; created on first use
  HashSet<String> seen = null;

  for (String anchor : anchors) {
    if (!deduplicate) {
      doc.add("anchor", anchor);
      continue;
    }

    if (seen == null) {
      seen = new HashSet<String>();
    }
    // add() returns true only when the set did not contain the value yet
    if (seen.add(anchor.toLowerCase())) {
      doc.add("anchor", anchor);
    }
  }

  return doc;
}
 
Example #28
Source File: AnchorIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * The {@link AnchorIndexingFilter} filter object which supports boolean 
 * configuration settings for the deduplication of anchors. 
 * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
 *
 * Copies the anchors of the inlinks into the "anchor" field of the
 * document; when deduplicate is set, anchors that differ only in case from
 * one already added during this call are skipped.
 *  
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object passing through the filter 
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
  Inlinks inlinks) throws IndexingException {

  String[] anchors = (inlinks == null) ? new String[0] : inlinks.getAnchors();

  // lower-cased anchors handled so far, allocated lazily
  HashSet<String> processed = null;

  for (int idx = 0; idx < anchors.length; idx++) {
    String anchor = anchors[idx];

    if (deduplicate) {
      if (processed == null) {
        processed = new HashSet<String>();
      }
      String key = anchor.toLowerCase();

      // skip anchors that were already processed
      if (processed.contains(key)) {
        continue;
      }
      doc.add("anchor", anchor);
      processed.add(key);
    } else {
      doc.add("anchor", anchor);
    }
  }

  return doc;
}
 
Example #29
Source File: LanguageDetectionFilter.java    From weslang with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * Adds a "lang" field to the document. The value comes from the parse
 * metadata (Metadata.LANGUAGE) when present; otherwise a Detector is run
 * over the title and text of the parse. A null result becomes "unknown".
 *
 * @throws IndexingException when conf is null, when the cause field is set,
 *           or when language detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
	if (language == null) {
		// title and text, separated by a single space
		String sample = parse.getData().getTitle() + " " + parse.getText();
		try {
			Detector detector = DetectorFactory.create();
			detector.setMaxTextLength(textsize_upper_limit);
			detector.append(sample);
			language = detector.detect();
		} catch (LangDetectException e) {
			throw new IndexingException("Detection failed.", e);
		}
	}

	doc.add("lang", language == null ? "unknown" : language);
	return doc;
}
 
Example #30
Source File: HTMLLanguageParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Uses statistical language identification to extract the page language.
 *
 * The title and the text of the parse are joined (the text preceded by a
 * space), cut to contentMaxlength characters unless that limit is -1, and
 * handed to a LanguageIdentifier.
 *
 * @return the identified language; null when parse is null, or when
 *         onlyCertain is set and the identifier is not reasonably certain
 */
private String identifyLanguage(Parse parse) {
    if (parse == null) {
        return null;
    }

    StringBuilder buffer = new StringBuilder();

    String title = parse.getData().getTitle();
    if (title != null) {
        buffer.append(title);
    }

    String body = parse.getText();
    if (body != null) {
        buffer.append(" ").append(body);
    }

    // apply the configured length limit, if there is one
    String sample = buffer.toString();
    boolean limited = this.contentMaxlength != -1;
    if (limited && sample.length() > this.contentMaxlength) {
        sample = sample.substring(0, contentMaxlength);
    }

    LanguageIdentifier identifier = new LanguageIdentifier(sample);

    if (onlyCertain && !identifier.isReasonablyCertain()) {
        return null;
    }
    return identifier.getLanguage();
}