crawlercommons.sitemaps.UnknownFormatException Java Examples

The following examples show how to use crawlercommons.sitemaps.UnknownFormatException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: SiteMapParser.java From vividus with Apache License 2.0

6 votes

/**
 * Fetches the sitemap at the given location and collects the URLs it lists.
 * <p>
 * The document is requested through {@code httpClient.doHttpGet} and parsed
 * against the base URI returned by {@code getBaseUri}, after that URI has been
 * passed through {@code UriUtils.removeUserInfo}. When the parsed document is
 * a sitemap index, every sitemap referenced by the index is parsed
 * recursively with this same method and the results are concatenated.
 *
 * @param siteMapUrl    location of the sitemap (or sitemap index) to fetch
 * @param siteMapParser parser used for this document and all nested sitemaps
 * @return the URLs of a plain sitemap, or the combined URLs of all sitemaps
 *         referenced by an index
 * @throws SiteMapParseException wrapping any {@link IOException},
 *         {@link UnknownFormatException} or {@link URISyntaxException} raised
 *         while fetching or parsing
 */
private Collection<SiteMapURL> parse(URI siteMapUrl, crawlercommons.sitemaps.SiteMapParser siteMapParser)
        throws SiteMapParseException
{
    try
    {
        HttpClientContext httpContext = new HttpClientContext();
        HttpResponse httpResponse = httpClient.doHttpGet(siteMapUrl, httpContext);
        URI baseUri = getBaseUri(httpContext, siteMapUrl);
        URI sanitizedUri = UriUtils.removeUserInfo(baseUri);
        AbstractSiteMap parsed = siteMapParser.parseSiteMap(httpResponse.getResponseBody(),
                sanitizedUri.toURL());

        // Plain sitemap: its URLs are the result.
        if (parsed.getType() != SitemapType.INDEX)
        {
            return ((SiteMap) parsed).getSiteMapUrls();
        }

        // Sitemap index: descend into each referenced sitemap and merge the results.
        List<SiteMapURL> collectedUrls = new LinkedList<>();
        for (AbstractSiteMap nested : ((SiteMapIndex) parsed).getSitemaps())
        {
            URI nestedLocation = nested.getUrl().toURI();
            collectedUrls.addAll(parse(nestedLocation, siteMapParser));
        }
        return collectedUrls;
    }
    catch (IOException | UnknownFormatException | URISyntaxException failure)
    {
        throw new SiteMapParseException(failure.getMessage(), failure);
    }
}

Example #2

Source File: NewsSiteMapParserTest.java From news-crawl with Apache License 2.0

5 votes

/**
 * Checks news sitemap handling of the bolt under test: the fixture must be
 * detected as a news sitemap, its link must be skipped while the publication
 * date is the original one from 2008, and links must be emitted once the
 * publication date is replaced by yesterday's date.
 */
@Test
public void testSiteMapParser() throws IOException, UnknownFormatException {
    String url = "https://example.org/sitemap-news.xml";
    byte[] originalContent = readContent("sitemap-news.xml");
    String contentType = "";
    Metadata parentMetadata = new Metadata();
    List<Outlink> links = new ArrayList<>();

    assertEquals(SitemapType.NEWS, bolt.detectContent(url, originalContent));

    // First pass over the untouched fixture: it carries
    //   <news:publication_date>2008-12-23</news:publication_date>
    // which is far in the past, so the link is expected to be skipped.
    bolt.parseSiteMap(url, originalContent, contentType, parentMetadata, links);
    assertEquals("Outdated link not skipped", 0, links.size());

    // Second pass: swap the old publication date for yesterday's date.
    String outdatedTag = "<news:publication_date>2008-12-23</news:publication_date>";
    String yesterday = LocalDateTime.now().minusDays(1)
            .format(DateTimeFormatter.ofPattern("yyyy-MM-dd"));
    String recentTag = "<news:publication_date>" + yesterday + "</news:publication_date>";

    String originalXml = new String(originalContent, StandardCharsets.UTF_8);
    byte[] refreshedContent = originalXml.replace(outdatedTag, recentTag)
            .getBytes(StandardCharsets.UTF_8);

    bolt.parseSiteMap(url, refreshedContent, contentType, parentMetadata, links);
    assertEquals(
            "Expected one <loc> and one additional <xhtml:link> link - image links are ignored",
            2, links.size());
}

Example #3

Source File: DelegatorHandler.java From crawler-commons with Apache License 2.0

4 votes

/**
 * Stores the given exception in this handler's {@code exception} field,
 * replacing whatever value was held before.
 *
 * @param exception the format exception to record
 */
protected void setException(UnknownFormatException exception) {
    this.exception = exception;
}

Example #4

Source File: DelegatorHandler.java From crawler-commons with Apache License 2.0

4 votes

/**
 * Returns the value currently held in this handler's {@code exception} field.
 *
 * @return the recorded format exception
 */
public UnknownFormatException getException() {
    return exception;
}

Example #5

Source File: DelegatorHandler.java From crawler-commons with Apache License 2.0

4 votes

/**
 * Handles the root element of the document: pushes its local name onto the
 * element stack, picks the delegate handler matching that name and, when
 * strict namespace checking is enabled, validates the element's namespace.
 * <p>
 * Unknown root elements are logged at debug level and no delegate is chosen.
 * If the namespace is rejected, an {@link UnknownFormatException} is recorded
 * via {@code setException} and the delegate is reset to {@code null}.
 *
 * @param uri        namespace URI of the root element
 * @param localName  local name of the root element, used to select the delegate
 * @param qName      qualified name of the root element (not used here)
 * @param attributes attributes of the root element (not used here)
 */
private void startRootElement(String uri, String localName, String qName, Attributes attributes) {
    elementStack.push(localName);

    if ("urlset".equals(localName)) {
        delegate = new XMLHandler(url, elementStack, strict);
    } else if ("sitemapindex".equals(localName)) {
        delegate = new XMLIndexHandler(url, elementStack, strict);
    } else if ("feed".equals(localName)) {
        delegate = new AtomHandler(url, elementStack, strict);
    } else if ("channel".equals(localName)) {
        // RSS feeds are recognized by the localName "channel" rather than by
        // the outer tag, because that outer tag may be <rdf:RDF> instead of
        // <rss>; both contain a <channel> element.
        // See https://github.com/crawler-commons/crawler-commons/issues/87
        // and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec
        delegate = new RSSHandler(url, elementStack, strict);
    } else {
        LOG.debug("Skipped unknown root element <{}> in {}", localName, url);
        return;
    }

    // configure delegate
    delegate.setStrictNamespace(isStrictNamespace());
    delegate.setAcceptedNamespaces(acceptedNamespaces);

    // without strict namespace checking there is nothing to validate
    if (!isStrictNamespace()) {
        delegate.setExtensionNamespaces(extensionNamespaces);
        return;
    }

    // no namespace checking for feeds
    if (delegate instanceof AtomHandler || delegate instanceof RSSHandler) {
        return;
    }

    String namespaceUri = uri;
    if (!isAcceptedNamespace(namespaceUri) && namespaceUri.startsWith("/")) {
        // first, try to resolve relative namespace URI (deprecated but
        // not forbidden), e.g., //www.sitemaps.org/schemas/sitemap/0.9
        try {
            namespaceUri = new URL(url, namespaceUri).toString();
        } catch (MalformedURLException ex) {
            LOG.warn("Failed to resolve relative namespace URI {} in sitemap {}", namespaceUri, url);
        }
    }

    if (isAcceptedNamespace(namespaceUri)) {
        delegate.setExtensionNamespaces(extensionNamespaces);
        return;
    }

    // namespace rejected: record why and drop the delegate
    String reason = Namespace.isSupported(namespaceUri)
            ? "Namespace <" + namespaceUri + "> not accepted"
            : "Unsupported namespace <" + namespaceUri + ">";
    setException(new UnknownFormatException(reason));
    delegate = null;
}