crawlercommons.sitemaps.UnknownFormatException Java Examples
The following examples show how to use
crawlercommons.sitemaps.UnknownFormatException.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SiteMapParser.java From vividus with Apache License 2.0 | 6 votes |
/**
 * Downloads the sitemap at the given location and collects the URLs it lists.
 * <p>
 * The sitemap body is fetched through {@code httpClient} and parsed against the base URI
 * resolved from the request context with any user info removed. When the parsed document
 * is a sitemap index, every sitemap it references is parsed recursively and the results
 * are concatenated.
 *
 * @param siteMapUrl    location of the sitemap (or sitemap index) to fetch
 * @param siteMapParser parser used to interpret the downloaded body
 * @return the URLs found in the sitemap, or in all sitemaps referenced by the index
 * @throws SiteMapParseException if fetching, parsing or URL conversion fails; the original
 *                               exception is kept as the cause
 */
private Collection<SiteMapURL> parse(URI siteMapUrl, crawlercommons.sitemaps.SiteMapParser siteMapParser)
        throws SiteMapParseException
{
    try
    {
        HttpClientContext requestContext = new HttpClientContext();
        HttpResponse httpResponse = httpClient.doHttpGet(siteMapUrl, requestContext);
        URI baseUriWithoutUserInfo = UriUtils.removeUserInfo(getBaseUri(requestContext, siteMapUrl));
        AbstractSiteMap parsed = siteMapParser.parseSiteMap(httpResponse.getResponseBody(),
                baseUriWithoutUserInfo.toURL());

        // A regular sitemap carries its URLs directly.
        if (parsed.getType() != SitemapType.INDEX)
        {
            return ((SiteMap) parsed).getSiteMapUrls();
        }

        // A sitemap index only points at other sitemaps: descend into each of them.
        List<SiteMapURL> collected = new LinkedList<>();
        for (AbstractSiteMap nested : ((SiteMapIndex) parsed).getSitemaps())
        {
            collected.addAll(parse(nested.getUrl().toURI(), siteMapParser));
        }
        return collected;
    }
    catch (IOException | UnknownFormatException | URISyntaxException e)
    {
        throw new SiteMapParseException(e.getMessage(), e);
    }
}
Example #2
Source File: NewsSiteMapParserTest.java From news-crawl with Apache License 2.0 | 5 votes |
@Test public void testSiteMapParser() throws IOException, UnknownFormatException { String url = "https://example.org/sitemap-news.xml"; byte[] content = readContent("sitemap-news.xml"); String contentType = ""; Metadata parentMetadata = new Metadata(); List<Outlink> links = new ArrayList<>(); SitemapType type = bolt.detectContent(url, content); assertEquals(SitemapType.NEWS, type); bolt.parseSiteMap(url, content, contentType, parentMetadata, links); // unmodified sitemap: // - publication date is far in the past, link should be skipped // <news:publication_date>2008-12-23</news:publication_date> assertEquals("Outdated link not skipped", 0, links.size()); // now set the publication date to yesterday LocalDateTime yesterday = LocalDateTime.now().minusDays(1); content = (new String(content, StandardCharsets.UTF_8)).replace( "<news:publication_date>2008-12-23</news:publication_date>", "<news:publication_date>" + yesterday.format( DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "</news:publication_date>") .getBytes(StandardCharsets.UTF_8); bolt.parseSiteMap(url, content, contentType, parentMetadata, links); assertEquals( "Expected one <loc> and one additional <xhtml:link> link - image links are ignored", 2, links.size()); }
Example #3
Source File: DelegatorHandler.java From crawler-commons with Apache License 2.0 | 4 votes |
/**
 * Stores the given exception in this handler's {@code exception} field.
 *
 * @param exception the format exception to record
 */
protected void setException(UnknownFormatException exception) {
    this.exception = exception;
}
Example #4
Source File: DelegatorHandler.java From crawler-commons with Apache License 2.0 | 4 votes |
/**
 * Returns the exception currently held in this handler's {@code exception} field.
 *
 * @return the stored format exception
 */
public UnknownFormatException getException() {
    return exception;
}
Example #5
Source File: DelegatorHandler.java From crawler-commons with Apache License 2.0 | 4 votes |
private void startRootElement(String uri, String localName, String qName, Attributes attributes) { elementStack.push(localName); if ("feed".equals(localName)) { delegate = new AtomHandler(url, elementStack, strict); } // See if it is a RSS feed by looking for the localName "channel" // element. This avoids the issue of having the outer tag named // <rdf:RDF> that was causing this code to fail. Inside of // the <rss> or <rdf> tag is a <channel> tag, so we can use that. // See https://github.com/crawler-commons/crawler-commons/issues/87 // and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec else if ("channel".equals(localName)) { delegate = new RSSHandler(url, elementStack, strict); } else if ("sitemapindex".equals(localName)) { delegate = new XMLIndexHandler(url, elementStack, strict); } else if ("urlset".equals(localName)) { delegate = new XMLHandler(url, elementStack, strict); } else { LOG.debug("Skipped unknown root element <{}> in {}", localName, url); return; } // configure delegate delegate.setStrictNamespace(isStrictNamespace()); delegate.setAcceptedNamespaces(acceptedNamespaces); // validate XML namespace if (isStrictNamespace()) { if (delegate instanceof AtomHandler || delegate instanceof RSSHandler) { // no namespace checking for feeds return; } if (!isAcceptedNamespace(uri) && uri.startsWith("/")) { // first, try to resolve relative namespace URI (deprecated but // not forbidden), e.g., //www.sitemaps.org/schemas/sitemap/0.9 try { URL u = new URL(url, uri); uri = u.toString(); } catch (MalformedURLException e) { LOG.warn("Failed to resolve relative namespace URI {} in sitemap {}", uri, url); } } if (!isAcceptedNamespace(uri)) { String msg; if (!Namespace.isSupported(uri)) { msg = "Unsupported namespace <" + uri + ">"; } else { msg = "Namespace <" + uri + "> not accepted"; } setException(new UnknownFormatException(msg)); delegate = null; return; } } delegate.setExtensionNamespaces(extensionNamespaces); }