Java Code Examples for org.jsoup.nodes.Document#getElementsByTag()

The following examples show how to use org.jsoup.nodes.Document#getElementsByTag(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also check out the related API usage in the sidebar.

Example 1

Source File: SourcePrinterTest.java From warnings-ng-plugin with MIT License

6 votes

/**
 * Renders {@code format.xml} with a default issue and checks that both the text of the whole
 * rendered document and the text of its {@code pre} elements equal the file content,
 * ignoring whitespace.
 */
@Test
@org.jvnet.hudson.test.Issue("JENKINS-55679")
void shouldRenderXmlFiles() {
    SourcePrinter sourcePrinter = new SourcePrinter();
    Issue defaultIssue = new IssueBuilder().build();

    Document rendered = Jsoup.parse(
            sourcePrinter.render(asStream("format.xml"), defaultIssue, NO_DESCRIPTION, ICON_URL));
    String sourceText = toString("format.xml");

    // The complete document text must reproduce the source file ...
    assertThat(rendered.text()).isEqualToIgnoringWhitespace(sourceText);
    // ... and so must the text collected from the <pre> elements alone.
    assertThat(rendered.getElementsByTag("pre").text()).isEqualToIgnoringWhitespace(sourceText);
}

Example 2

Source File: MeiziUtil.java From MoeQuest with Apache License 2.0

6 votes

/**
 * Parses the HTML of the selfie ("autodyne") page into {@link MeiziTu} entries.
 * <p>
 * At most the first 15 {@code <p>} elements are inspected. For each one that contains an
 * {@code <img>}, an entry is created from the image's {@code src} and {@code alt} attributes;
 * paragraphs without an image are skipped. Width and height are always set to 0.
 *
 * @param html the page markup to parse
 * @param type the type value stored on every created entry
 * @return the extracted entries, in document order; empty if nothing matched
 */
public List<MeiziTu> parserMeiziTuByAutodyne(String html, String type) {

  List<MeiziTu> list = new ArrayList<>();
  Elements paragraphs = Jsoup.parse(html).getElementsByTag("p");

  // Never index past the paragraphs that actually exist: the page may contain fewer than 15.
  int limit = Math.min(15, paragraphs.size());
  for (int i = 0; i < limit; i++) {
    Element img = paragraphs.get(i).select("img").first();
    if (img == null) {
      // first() returns null when the paragraph holds no <img>; nothing to extract here.
      continue;
    }
    MeiziTu meiziTu = new MeiziTu();
    // The order is the paragraph index, exactly as before, even when earlier ones are skipped.
    meiziTu.setOrder(i);
    meiziTu.setType(type);
    meiziTu.setWidth(0);
    meiziTu.setHeight(0);
    meiziTu.setImageurl(img.attr("src"));
    meiziTu.setTitle(img.attr("alt"));
    list.add(meiziTu);
  }
  return list;
}

Example 3

Source File: WhenRubyExtensionIsRegistered.java From asciidoctorj with Apache License 2.0

6 votes

/**
 * Registers the Ruby {@code GistBlockMacro} class under the block name {@code mygist} and
 * checks that converting a document using that macro yields exactly one {@code script}
 * element pointing at the expected gist URL.
 */
@Test
public void ruby_block_macro_processor_should_be_registered_with_block_name() {

    asciidoctor.rubyExtensionRegistry()
            .loadClass(getClass().getResourceAsStream("/ruby-extensions/gist-block-macro.rb"))
            .blockMacro("mygist", "GistBlockMacro");

    String source = ".My Gist\n" + "mygist::123456[]";
    String html = asciidoctor.convert(source, options().toFile(false).get());

    Elements scripts = Jsoup.parse(html, "UTF-8").getElementsByTag("script");
    assertThat(scripts.size(), is(1));
    assertThat(scripts.get(0).attr("src"), is("https://gist.github.com/123456.js"));

}

Example 4

Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0

6 votes

/**
 * Loads the sub-categories of this category. Only one level is loaded, i.e. the
 * sub-categories of the sub-categories are not loaded.
 * When this method is called, the server is queried for the sub-categories of this category
 * and this object's {@link #children} is updated.
 * <p>
 * To load sub-categories recursively, call {@link #loadAllChild()}.
 *
 * @throws IOException if querying the child nodes from the server fails
 */
public void loadChild() throws IOException {
    if (isTerminal()) {
        // Nothing is requested for a terminal category.
        return;
    }

    checkCookie();
    String endpoint = NJULib.baseUrl + "/classifyview";
    String payload = "fenlei=" + this.getId() + "&lib=markbook";
    String response = MyHttpRequest.postWithCookie(payload, endpoint, null, cookie, "UTF-8", "UTF-8", 1000);

    // Every <li> of the response describes one direct sub-category.
    for (Element item : Jsoup.parse(response).getElementsByTag("li")) {
        String id = item.attr("id");
        String rawName = item.getElementsByTag("a").text();
        // An <img> whose onClick mentions getSubTree marks an entry that has children of its own.
        boolean expandable = item.getElementsByTag("img").attr("onClick").contains("getSubTree");
        String displayName = MyDecoder.decodeUrlUnicode(rawName);

        BookClass child;
        if (expandable) {
            child = new BookClass(id, displayName, this);
        } else {
            child = new TerminalBookClass(id, displayName, this);
        }
        child.setCookie(cookie);
        this.addChild(child);
    }
    this.isLoaded = true;
}

Example 5

Source File: HtmlUtil.java From jbake with MIT License

6 votes

/**
 * Rewrites the {@code src} of every {@code img} element in the body of a file.
 * <p>
 * Image paths are specified relative to the assets folder. Each image is handed to
 * {@code transformImageSource}, which is expected to prefix the site host to all sources except
 * those starting with {@code http://} or {@code https://}, and to first replace a leading
 * {@code "./"} (relative to the source file) with the output file directory.
 * The transformed markup replaces the body stored in {@code fileContents}.
 *
 * @param fileContents  Map representing file contents
 * @param configuration Configuration object
 */
public static void fixImageSourceUrls(Map<String, Object> fileContents, JBakeConfiguration configuration) {
    String body = fileContents.get(Attributes.BODY).toString();
    boolean prependHost = configuration.getImgPathPrependHost();
    String host = configuration.getSiteHost();
    String documentUri = getDocumentUri(fileContents);

    Document fragment = Jsoup.parseBodyFragment(body);
    for (Element image : fragment.getElementsByTag("img")) {
        transformImageSource(image, documentUri, host, prependHost);
    }

    // body().html() returns only the inner markup, so no <body></body> wrapper is stored.
    fileContents.put(Attributes.BODY, fragment.body().html());
}

Example 6

Source File: EmailServiceImpl.java From gravitee-management-rest-api with Apache License 2.0

6 votes

/**
 * Turns local image references of an HTML mail body into inline ({@code cid:}) attachments.
 * <p>
 * Every {@code img} with a {@code src} that does not start with {@code http} gets its
 * {@code src} rewritten to {@code cid:<src>}. The resulting HTML is set as the message text and
 * each referenced file, resolved against {@code templatesPath}, is added inline under the
 * original {@code src} value.
 *
 * @param mailMessage the message helper receiving the text and the inline resources
 * @param htmlText    the HTML body to process
 * @return the HTML that was set on the message
 * @throws Exception if setting the text or adding an inline resource fails
 */
private String addResourcesInMessage(final MimeMessageHelper mailMessage, final String htmlText) throws Exception {
    final Document parsedHtml = Jsoup.parse(htmlText);

    // Original src values of the images that were switched to cid: references.
    final List<String> inlineSources = new ArrayList<>();
    parsedHtml.getElementsByTag("img").forEach(image -> {
        if (!image.hasAttr("src")) {
            return;
        }
        final String src = image.attr("src");
        if (src.startsWith("http")) {
            // Remote images stay untouched.
            return;
        }
        image.attr("src", "cid:" + src);
        inlineSources.add(src);
    });

    final String renderedHtml = parsedHtml.html();
    mailMessage.setText(renderedHtml, true);

    for (final String source : inlineSources) {
        mailMessage.addInline(
                source,
                new FileSystemResource(new File(templatesPath, source)),
                getContentTypeByFileName(source));
    }

    return renderedHtml;
}

Example 7

Source File: AgentManagementServiceImpl.java From Insights with Apache License 2.0

6 votes

/**
 * Returns the available agents per version.
 * <p>
 * When online registration is disabled the offline list is returned. Otherwise the page at the
 * configured docroot URL is fetched and every link whose text starts with {@code v} is treated
 * as a version (trailing {@code /} stripped) and mapped to the result of {@code getAgents}.
 *
 * @return a map from version to the agents available for it
 * @throws InsightsCustomException if the docroot page cannot be fetched
 */
@Override
public Map<String, ArrayList<String>> getSystemAvailableAgentList() throws InsightsCustomException {
	if (!ApplicationConfigProvider.getInstance().getAgentDetails().isOnlineRegistration()) {
		return getOfflineSystemAvailableAgentList();
	}

	Map<String, ArrayList<String>> agentsByVersion = new TreeMap<>();
	String docrootUrl = ApplicationConfigProvider.getInstance().getAgentDetails().getDocrootUrl();
	try {
		for (Element link : Jsoup.connect(docrootUrl).get().getElementsByTag("a")) {
			String label = link.text();
			if (label == null || !label.startsWith("v")) {
				continue;
			}
			String version = StringUtils.stripEnd(label, "/");
			agentsByVersion.put(version, getAgents(version));
		}
	} catch (IOException e) {
		log.error("Error while getting system agent list ", e);
		throw new InsightsCustomException(e.toString());
	}
	return agentsByVersion;
}

Example 8

Source File: LyricsChart.java From QuickLyric with GNU General Public License v3.0

6 votes

/**
 * Searches the ChartLyrics service for songs whose lyrics contain the given text.
 * <p>
 * The query is URL-encoded, the {@code SearchLyricText} endpoint is fetched and every
 * {@code SearchLyricResult} element of the response is converted into a {@link Lyrics} search
 * item carrying artist, title and the URL from which the full lyrics can be retrieved.
 *
 * @param query the lyric text to search for
 * @return the search items found; an empty list if the request or the parsing fails
 */
public static ArrayList<Lyrics> search(String query) {
    ArrayList<Lyrics> results = new ArrayList<>();
    try {
        String url = "http://api.chartlyrics.com/apiv1.asmx/SearchLyricText?lyricText="
                + URLEncoder.encode(query, "UTF-8");
        // Fetch the response. Jsoup.parse(String, String) would treat the URL text itself as the
        // markup to parse, so no result element could ever be found.
        Document doc = Jsoup.connect(url).get();
        for (Element element : doc.getElementsByTag("SearchLyricResult")) {
            String id = element.getElementsByTag("TrackId").get(0).text();
            String checksum = element.getElementsByTag("TrackChecksum").get(0).text();
            Lyrics lyrics = new Lyrics(Lyrics.SEARCH_ITEM);
            lyrics.setArtist(element.getElementsByTag("artist").get(0).text());
            lyrics.setTitle(element.getElementsByTag("song").get(0).text());
            lyrics.setURL("http://api.chartlyrics.com/apiv1.asmx/GetLyric?lyricId=" + id + "&lyricCheckSum=" + checksum);
            results.add(lyrics);
        }
        return results;
    } catch (Exception e) {
        // Best effort: any failure (network, malformed entry) yields an empty result below.
        // The stack trace is printed only for non-I/O errors outside of debug builds.
        if (!BuildConfig.DEBUG && !(e instanceof IOException)) {
            e.printStackTrace();
        }
    }

    return new ArrayList<>();
}

Example 9

Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0

6 votes

/**
 * Loads the direct sub-categories of this category; the sub-categories of those
 * sub-categories are not loaded.
 * Calling this method queries the server for the sub-categories of this category and updates
 * this object's {@link #children}.
 * <p>
 * For a recursive load, call {@link #loadAllChild()}.
 *
 * @throws IOException if querying the child nodes from the server fails
 */
public void loadChild() throws IOException {
    if (isTerminal()) {
        return;
    }

    checkCookie();
    String requestUrl = NJULib.baseUrl + "/classifyview";
    String requestBody = "fenlei=" + this.getId() + "&lib=markbook";
    String html = MyHttpRequest.postWithCookie(requestBody, requestUrl, null, cookie, "UTF-8", "UTF-8", 1000);

    Elements entries = Jsoup.parse(html).getElementsByTag("li");
    for (Element entry : entries) {
        String id = entry.attr("id");
        String encodedName = entry.getElementsByTag("a").text();
        // Entries whose <img> onClick handler mentions getSubTree have further sub-categories.
        boolean hasSubTree = entry.getElementsByTag("img").attr("onClick").contains("getSubTree");
        String name = MyDecoder.decodeUrlUnicode(encodedName);

        BookClass child = hasSubTree
                ? new BookClass(id, name, this)
                : new TerminalBookClass(id, name, this);
        child.setCookie(cookie);
        this.addChild(child);
    }
    this.isLoaded = true;
}

Example 10

Source File: WebDavFile.java From a with GNU General Public License v3.0

6 votes

/**
 * Extracts the file entries from a WebDAV directory listing response.
 * <p>
 * Every {@code d:response} element is inspected; its first {@code d:href} is read and, unless
 * it ends with {@code /}, turned into a {@link WebDavFile} located directly below this
 * object's URL. Responses without a {@code d:href} are skipped.
 *
 * @param s the response body to parse
 * @return the files found; entries whose URL is malformed are left out
 */
private List<WebDavFile> parseDir(String s) {
    List<WebDavFile> list = new ArrayList<>();
    Document document = Jsoup.parse(s);
    String url = getUrl();
    String baseUrl = url.endsWith("/") ? url : url + "/";
    for (Element element : document.getElementsByTag("d:response")) {
        Elements hrefs = element.getElementsByTag("d:href");
        if (hrefs.isEmpty()) {
            // A response without href must not abort the whole listing.
            continue;
        }
        String href = hrefs.get(0).text();
        if (href.endsWith("/")) {
            // Presumably a collection (directory) entry; only files are returned.
            continue;
        }
        String fileName = href.substring(href.lastIndexOf("/") + 1);
        try {
            WebDavFile webDavFile = new WebDavFile(baseUrl + fileName);
            webDavFile.setDisplayName(fileName);
            webDavFile.setUrlName(href);
            list.add(webDavFile);
        } catch (MalformedURLException e) {
            // Skip this entry but keep processing the remaining ones.
            e.printStackTrace();
        }
    }
    return list;
}

Example 11

Source File: WhenAttributesAreUsedInAsciidoctor.java From asciidoctorj with Apache License 2.0

5 votes

/**
 * Converts {@code renderwithfrontmatter.adoc} with the skip-front-matter attribute enabled and
 * checks that the output contains no {@code hr} element.
 */
@Test
public void should_skip_front_matter_if_specified_by_skip_front_matter_attribute()
        throws IOException {

    Attributes skipFrontMatter = attributes().skipFrontMatter(true).get();
    Options conversionOptions = options()
            .toFile(false)
            .inPlace(false)
            .attributes(skipFrontMatter)
            .get();

    String html = asciidoctor.convertFile(classpath.getResource("renderwithfrontmatter.adoc"), conversionOptions);

    assertThat(Jsoup.parse(html, "UTF-8").getElementsByTag("hr").size(), is(0));

}

Example 12

Source File: SessionUtil.java From snowflake-jdbc with Apache License 2.0

5 votes

/**
 * Reads the post back url out of the HTML returned by the IDP: the {@code action} attribute
 * of the first {@code form} element inside the document body.
 *
 * @param html The HTML that we are parsing to find the post back url
 * @return The post back url
 */
static private String getPostBackUrlFromHTML(String html)
{
  return Jsoup.parse(html)
      .getElementsByTag("body").get(0)
      .getElementsByTag("form").first()
      .attr("action");
}

Example 13

Source File: VulnChecker.java From zap-extensions with Apache License 2.0

5 votes

/**
 * Returns the securiteam.com search result links for the given application and version.
 * <p>
 * The search page for {@code appName + "+" + version} is loaded. If it reports that no matches
 * were found, a message is printed and an empty list is returned. Otherwise the {@code href}
 * of the first link inside every {@code dl} element is printed and collected.
 *
 * @param appName the application name used as search word
 * @param version the application version used as search word
 * @return the links found, each followed by a newline character
 * @throws Exception if the search URL is malformed or the page cannot be loaded
 */
public static ArrayList<String> fromSecuritiTeam(String appName, String version)
        throws Exception {
    ArrayList<String> links = new ArrayList<String>();
    String searchUrl = "http://www.securiteam.com/cgi-bin/htsearch?words=" + appName + "+" + version;

    Document page = new WebPage(new URL(searchUrl)).getDocument();
    if (page.outerHtml().contains("No matches were found for")) {
        System.out.println("No Results Found");
        return links;
    }

    // For the moment only the links are returned; the linked pages are not fetched or parsed.
    for (Element entry : page.getElementsByTag("dl")) {
        String link = entry.getElementsByTag("a").get(0).attr("href");
        System.out.println(link);
        links.add(link + "\n");
    }
    return links;
}

Example 14

Source File: EncodingDetect.java From a with GNU General Public License v3.0

5 votes

/**
 * Tries to read the character encoding declared inside a document given as raw bytes.
 * <p>
 * The bytes are decoded as UTF-8 and parsed. The declaration is looked up in this order:
 * <ol>
 * <li>an {@code encoding} token in the text of the first child node of the document;</li>
 * <li>the {@code charset} attribute of a {@code meta} element;</li>
 * <li>the {@code content} attribute of a {@code meta} element whose {@code http-equiv} is
 * {@code content-type}.</li>
 * </ol>
 * If nothing is found, or any exception occurs on the way, the result of
 * {@code getJavaEncode(bytes)} is returned instead.
 *
 * @param bytes the raw document content
 * @return the declared encoding name, or the fallback from {@code getJavaEncode}
 */
public static String getEncodeInHtml(@NonNull byte[] bytes) {
    try {
        String charsetStr = "UTF-8";
        // Provisional decode as UTF-8, only to be able to read the declarations.
        Document doc = Jsoup.parse(new String(bytes, charsetStr));
        // Presumably targets an XML declaration such as <?xml ... encoding="..."?> — TODO confirm.
        int a = doc.childNode(0).toString().indexOf("encoding");
        if (a > 0) {
            String e = doc.childNode(0).toString().substring(a);
            // The value is the text between the first two double quotes following "encoding".
            int b = e.indexOf('"');
            int c = e.indexOf('"', b + 1);
            return e.substring(b + 1, c);
        }
        Elements metaTags = doc.getElementsByTag("meta");
        for (Element metaTag : metaTags) {
            String content = metaTag.attr("content");
            String http_equiv = metaTag.attr("http-equiv");
            // <meta charset="..."> form: a non-empty attribute value is returned as is.
            charsetStr = metaTag.attr("charset");
            if (!charsetStr.isEmpty()) {
                if (!isEmpty(charsetStr)) {
                    return charsetStr;
                }
            }
            // <meta http-equiv="content-type" content="..."> form.
            if (http_equiv.toLowerCase().equals("content-type")) {
                if (content.toLowerCase().contains("charset")) {
                    // Everything after the 8 characters starting at "charset" (i.e. after "charset=").
                    charsetStr = content.substring(content.toLowerCase().indexOf("charset") + "charset=".length());
                } else {
                    // No "charset" token: take what follows the first ';' (the whole value if there is none).
                    charsetStr = content.substring(content.toLowerCase().indexOf(";") + 1);
                }
                if (!isEmpty(charsetStr)) {
                    return charsetStr;
                }
            }
        }
    } catch (Exception ignored) {
        // Deliberate best effort: any failure above falls through to the fallback below.
    }
    return getJavaEncode(bytes);
}

Example 15

Source File: VscoRipper.java From ripme with MIT License

5 votes

/**
 * Resolves the image URL advertised by a page through its {@code og:image} meta tag.
 * <p>
 * The page is fetched with the configured user agent. The {@code content} of the first
 * {@code meta} element whose {@code property} is {@code og:image} is taken and every
 * {@code ?h=<digits>} part is removed from it. An error is logged when the result is empty.
 *
 * @param url the page to fetch
 * @return the image URL, or an empty string if none was found
 * @throws IOException if the page cannot be fetched
 */
private String vscoImageToURL(String url) throws IOException{
    Document page = Jsoup.connect(url).userAgent(USER_AGENT).get();

    String imageUrl = "";
    for (Element meta : page.getElementsByTag("meta")) {
        if (!"og:image".equals(meta.attr("property"))) {
            continue;
        }
        // Drop the "?h=xxx" height parameter (each x being a digit) from the URL.
        imageUrl = meta.attr("content").replaceAll("\\?h=[0-9]+", "");
        LOGGER.debug("Found image URL: " + imageUrl);
        // Only the first og:image tag is used; there should be just one image to download.
        break;
    }

    if (imageUrl.isEmpty()) {
        // Means the website changed and this method needs to be adapted.
        LOGGER.error("Could not find image URL at: " + url);
    }

    return imageUrl;
}

Example 16

Source File: ResourceQuote.java From templatespider with Apache License 2.0

5 votes

/**
 * Replaces the {@code src} of the img tags in the document.
 * <p>
 * Each {@code src} is passed through {@code hierarchyReplace} together with this object's
 * base URI; the attribute is rewritten only when the returned value differs from the
 * current one.
 *
 * @param doc the document whose img tags are processed (modified in place)
 * @return the same document instance
 */
public Document imgTag(Document doc){
	for (Element image : doc.getElementsByTag("img")) {
		String currentSrc = image.attr("src");
		String replacedSrc = hierarchyReplace(this.baseUri, currentSrc);
		if(currentSrc.equals(replacedSrc)){
			continue;
		}
		image.attr("src", replacedSrc);
	}
	return doc;
}

Example 17

Source File: DefaultEmailNotifier.java From jetlinks-community with Apache License 2.0

5 votes

/**
 * Collects the non-remote images referenced by an HTML mail text.
 * <p>
 * For every {@code img} whose {@code src} does not start with {@code http}, a generated key is
 * mapped to the original {@code src}, and the element of the locally parsed document is
 * pointed at {@code cid:<key>}.
 *
 * @param sendText the HTML text to inspect
 * @return a map from generated key to original image source
 */
private Map<String, String> extractSendTextImage(String sendText) {
    Map<String, String> images = new HashMap<>();
    for (Element image : Jsoup.parse(sendText).getElementsByTag("img")) {
        String source = image.attr("src");
        if (!source.startsWith("http")) {
            String contentId = IDGenerator.MD5.generate();
            image.attr("src", "cid:".concat(contentId));
            images.put(contentId, source);
        }
    }
    return images;
}

Example 18

Source File: CustomVRaptorIntegration.java From mamute with Apache License 2.0

4 votes

/**
 * Parses the given HTML and returns all elements with the given tag name.
 *
 * @param html    the markup to parse
 * @param tagName the tag name to look for
 * @return the matching elements
 */
protected Elements getElementsByTag(String html, String tagName) {
	return Jsoup.parse(html).getElementsByTag(tagName);
}

Example 19

Source File: ThechiveRipper.java From ripme with MIT License

4 votes

/**
 * Extracts the image and gif URLs of a gallery page.
 * <p>
 * The image urls are stored in a {@code <script>} tag of the document. This script contains a
 * single array var by name CHIVE_GALLERY_ITEMS. All {@code <img>} tags matched in such a
 * script are collected into one string, parsed as HTML, and the url of each is read from
 * {@code data-gifsrc} (gifs) or {@code src} (other images). Query parameters are stripped
 * from the resulting urls.
 *
 * @param doc the gallery page
 * @return the urls found, without query string; empty if no gallery script is present
 */
private List<String> getUrlsFromThechive(Document doc) {
    List<String> result = new ArrayList<>();
    Elements scripts = doc.getElementsByTag("script");

    for (Element script : scripts) {
        String data = script.data();

        if (!data.contains("CHIVE_GALLERY_ITEMS")) {
            continue;
        }

        /*
         * We add all the <img/> tags in a single StringBuilder and parse as HTML for
         * easy sorting of img/ gifs.
         */
        StringBuilder allImgTags = new StringBuilder();
        Matcher matcher = imagePattern.matcher(data);
        while (matcher.find()) {
            // Unescape '\' from the img tags, which also unescapes the img url.
            allImgTags.append(matcher.group(0).replaceAll("\\\\", ""));
        }

        // Now we parse and sort links.
        Document imgDoc = Jsoup.parse(allImgTags.toString());
        Elements imgs = imgDoc.getElementsByTag("img");
        for (Element img : imgs) {
            if (img.hasAttr("data-gifsrc")) {
                // For gifs.
                result.add(img.attr("data-gifsrc"));
            } else {
                // For jpeg images.
                result.add(img.attr("src"));
            }
        }
    }

    // Strip all GET parameters from the links (such as quality, width, height) to get the
    // original image. A link without '?' is kept unchanged: indexOf would return -1 there and
    // substring(0, -1) throws StringIndexOutOfBoundsException.
    result.replaceAll(s -> {
        int queryStart = s.indexOf('?');
        return queryStart >= 0 ? s.substring(0, queryStart) : s;
    });

    return result;
}

Example 20

Source File: FeedParser.java From WordPressHelper with MIT License

4 votes

/**
 * Downloads the feed at {@code FEED_URL} and converts every {@code item} element into a
 * {@link FeedItem}.
 * <p>
 * An item is added to {@code feedItems} only when its guid contains {@code p=}; the text
 * following it becomes the item's id. An {@link IOException} while fetching is printed and
 * otherwise ignored.
 *
 * @param params not used
 * @return always {@code null}
 */
@Override
protected Object doInBackground(Object[] params) {
    try {
        Document document = Jsoup.connect(FEED_URL)
                .userAgent("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22")
                .timeout(60000).ignoreContentType(true).get();
        for (Element element : document.getElementsByTag("item")) {
            FeedItem feedItem = new FeedItem();

            // get all simple information
            feedItem.setTitle(element.getElementsByTag("title").first().text());
            feedItem.setPubDate(element.getElementsByTag("pubDate").first().text());
            feedItem.setCreator(element.getElementsByTag("dc:creator").first().text());
            feedItem.setDescription(element.getElementsByTag("description").first().text());
            // Looked up once and reused below for the image extraction.
            String content = element.getElementsByTag("content:encoded").first().text();
            feedItem.setContent(content);
            feedItem.setCommentRss(element.getElementsByTag("wfw:commentRss").first().text());
            feedItem.setComments(element.getElementsByTag("slash:comments").first().text());
            // The link target is read from the node following the <link> element.
            feedItem.setLink(element.select("link").first().nextSibling().toString().trim());
            // Looked up once and reused below for the id extraction.
            String guid = element.getElementsByTag("guid").first().text();
            feedItem.setGuid(guid);

            // get first image of the content (empty string when there is none)
            feedItem.setImage(Jsoup.parse(content).select("img").attr("src"));

            // get all categories; iterate the result directly instead of re-querying per index
            ArrayList<String> category = new ArrayList<>();
            for (Element categoryElement : element.getElementsByTag("category")) {
                category.add(categoryElement.text());
            }
            feedItem.setCategory(category);

            // get id: the part of the guid after "p="
            String[] idPost = guid.split("p=");
            if (idPost.length > 1) {
                feedItem.setId(idPost[1]);
                // add feed item to the result list
                feedItems.add(feedItem);
            }

        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}