crawlercommons.sitemaps.UnknownFormatException Java Examples

The following examples show how to use crawlercommons.sitemaps.UnknownFormatException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SiteMapParser.java    From vividus with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches the sitemap at the given location and returns the URLs it lists.
 * <p>
 * The document is downloaded through {@code httpClient}, parsed with the supplied crawler-commons parser
 * (using the base URI of the request with user info removed as the sitemap location) and then handled
 * according to its type: a plain sitemap yields its own URLs, while a sitemap index is expanded by
 * invoking this method recursively for every sitemap it references and concatenating the results.
 *
 * @param siteMapUrl    the location of the sitemap (or sitemap index) to download
 * @param siteMapParser the crawler-commons parser used to parse the downloaded content
 * @return the URLs found in the sitemap, including those of all nested sitemaps of an index
 * @throws SiteMapParseException if an {@link IOException}, {@link UnknownFormatException} or
 *                               {@link URISyntaxException} occurs; the original exception is kept as the cause
 */
private Collection<SiteMapURL> parse(URI siteMapUrl, crawlercommons.sitemaps.SiteMapParser siteMapParser)
        throws SiteMapParseException
{
    try
    {
        HttpClientContext httpContext = new HttpClientContext();
        HttpResponse httpResponse = httpClient.doHttpGet(siteMapUrl, httpContext);
        URI sanitizedUrl = UriUtils.removeUserInfo(getBaseUri(httpContext, siteMapUrl));
        AbstractSiteMap parsedSiteMap = siteMapParser.parseSiteMap(httpResponse.getResponseBody(),
                sanitizedUrl.toURL());

        // A regular sitemap lists its URLs directly, nothing to expand
        if (parsedSiteMap.getType() != SitemapType.INDEX)
        {
            return ((SiteMap) parsedSiteMap).getSiteMapUrls();
        }

        // A sitemap index only references other sitemaps: resolve each of them in turn
        List<SiteMapURL> collectedUrls = new LinkedList<>();
        for (AbstractSiteMap nestedSiteMap : ((SiteMapIndex) parsedSiteMap).getSitemaps())
        {
            collectedUrls.addAll(parse(nestedSiteMap.getUrl().toURI(), siteMapParser));
        }
        return collectedUrls;
    }
    catch (IOException | UnknownFormatException | URISyntaxException e)
    {
        throw new SiteMapParseException(e.getMessage(), e);
    }
}
 
Example #2
Source File: NewsSiteMapParserTest.java    From news-crawl with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies news sitemap handling: the content is detected as a news
 * sitemap, links with an outdated publication date are skipped, and links
 * are emitted once the publication date is recent.
 */
@Test
public void testSiteMapParser() throws IOException, UnknownFormatException {
    String sitemapUrl = "https://example.org/sitemap-news.xml";
    byte[] sitemapBytes = readContent("sitemap-news.xml");
    String mimeType = "";
    Metadata metadata = new Metadata();
    List<Outlink> outlinks = new ArrayList<>();

    SitemapType detectedType = bolt.detectContent(sitemapUrl, sitemapBytes);
    assertEquals(SitemapType.NEWS, detectedType);

    bolt.parseSiteMap(sitemapUrl, sitemapBytes, mimeType, metadata, outlinks);

    // unmodified sitemap:
    // - publication date is far in the past, link should be skipped
    // <news:publication_date>2008-12-23</news:publication_date>
    assertEquals("Outdated link not skipped", 0, outlinks.size());

    // now set the publication date to yesterday
    LocalDateTime yesterday = LocalDateTime.now().minusDays(1);
    String outdatedTag = "<news:publication_date>2008-12-23</news:publication_date>";
    String recentTag = "<news:publication_date>"
            + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd"))
            + "</news:publication_date>";
    String originalXml = new String(sitemapBytes, StandardCharsets.UTF_8);
    sitemapBytes = originalXml.replace(outdatedTag, recentTag)
            .getBytes(StandardCharsets.UTF_8);

    // the same list is reused, it was still empty after the first pass
    bolt.parseSiteMap(sitemapUrl, sitemapBytes, mimeType, metadata, outlinks);
    assertEquals(
            "Expected one <loc> and one additional <xhtml:link> link - image links are ignored",
            2, outlinks.size());
}
 
Example #3
Source File: DelegatorHandler.java    From crawler-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Stores the given exception in this handler's {@code exception} field,
 * replacing whatever value was held there before.
 *
 * @param exception
 *            the format exception to record
 */
protected void setException(UnknownFormatException exception) {
    this.exception = exception;
}
 
Example #4
Source File: DelegatorHandler.java    From crawler-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the current value of this handler's {@code exception} field.
 *
 * @return the stored {@link UnknownFormatException}; NOTE(review):
 *         presumably {@code null} if none has been recorded - confirm
 *         against the field declaration
 */
public UnknownFormatException getException() {
    return exception;
}
 
Example #5
Source File: DelegatorHandler.java    From crawler-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Picks and configures the delegate handler based on the name of the given
 * element.
 * <p>
 * The local name is always pushed onto the element stack. If it is not one
 * of the recognized names, the element is skipped with a debug message and
 * no delegate is set. Otherwise the delegate is created and configured; in
 * strict namespace mode the namespace URI of XML sitemaps and sitemap
 * indexes is validated (feeds are exempt), and a rejected namespace is
 * recorded via {@link #setException(UnknownFormatException)} while the
 * delegate is reset to {@code null}.
 *
 * @param uri
 *            namespace URI of the element
 * @param localName
 *            local name of the element, used to select the delegate
 * @param qName
 *            qualified name of the element (not used here)
 * @param attributes
 *            attributes of the element (not used here)
 */
private void startRootElement(String uri, String localName, String qName, Attributes attributes) {
    elementStack.push(localName);

    if ("feed".equals(localName)) {
        delegate = new AtomHandler(url, elementStack, strict);
    } else if ("channel".equals(localName)) {
        // An RSS feed is recognized by its <channel> element rather than by
        // the outer tag, because the outer tag may be <rss> or <rdf:RDF>;
        // both contain a <channel> element.
        // See https://github.com/crawler-commons/crawler-commons/issues/87
        // and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec
        delegate = new RSSHandler(url, elementStack, strict);
    } else if ("sitemapindex".equals(localName)) {
        delegate = new XMLIndexHandler(url, elementStack, strict);
    } else if ("urlset".equals(localName)) {
        delegate = new XMLHandler(url, elementStack, strict);
    } else {
        LOG.debug("Skipped unknown root element <{}> in {}", localName, url);
        return;
    }

    // hand the namespace settings over to the freshly created delegate
    delegate.setStrictNamespace(isStrictNamespace());
    delegate.setAcceptedNamespaces(acceptedNamespaces);

    // without strict namespace checking there is nothing to validate
    if (!isStrictNamespace()) {
        delegate.setExtensionNamespaces(extensionNamespaces);
        return;
    }

    // feeds are exempt from namespace checking
    if (delegate instanceof AtomHandler || delegate instanceof RSSHandler) {
        return;
    }

    String namespaceUri = uri;
    if (!isAcceptedNamespace(namespaceUri) && namespaceUri.startsWith("/")) {
        // relative namespace URIs are deprecated but not forbidden, e.g.,
        // //www.sitemaps.org/schemas/sitemap/0.9 - try to resolve them
        // against the sitemap URL before giving up
        try {
            URL resolved = new URL(url, namespaceUri);
            namespaceUri = resolved.toString();
        } catch (MalformedURLException e) {
            LOG.warn("Failed to resolve relative namespace URI {} in sitemap {}", namespaceUri, url);
        }
    }

    if (!isAcceptedNamespace(namespaceUri)) {
        String reason = Namespace.isSupported(namespaceUri)
                ? "Namespace <" + namespaceUri + "> not accepted"
                : "Unsupported namespace <" + namespaceUri + ">";
        setException(new UnknownFormatException(reason));
        delegate = null;
        return;
    }

    delegate.setExtensionNamespaces(extensionNamespaces);
}