Java Code Examples for org.jsoup.nodes.Document#getElementsByTag()

The following examples show how to use org.jsoup.nodes.Document#getElementsByTag(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
/**
 * Searches the ChartLyrics web service for songs whose lyrics contain the given text.
 *
 * @param query lyric fragment to search for; it is URL-encoded before being sent
 * @return one {@link Lyrics} entry per {@code SearchLyricResult} element of the response,
 *         or an empty list if the request or the parsing fails
 */
public static ArrayList<Lyrics> search(String query) {
    ArrayList<Lyrics> results = new ArrayList<>();
    try {
        String url = "http://api.chartlyrics.com/apiv1.asmx/SearchLyricText?lyricText=";
        url += URLEncoder.encode(query, "UTF-8");
        // Jsoup.parse(String, String) treats its first argument as markup, so passing the URL
        // there never yields any result element; the response has to be fetched over HTTP.
        Document doc = Jsoup.connect(url).get();
        Elements elements = doc.getElementsByTag("SearchLyricResult");
        for (Element element : elements) {
            String id = element.getElementsByTag("TrackId").get(0).text();
            String checksum = element.getElementsByTag("TrackChecksum").get(0).text();
            Lyrics lyrics = new Lyrics(Lyrics.SEARCH_ITEM);
            lyrics.setArtist(element.getElementsByTag("artist").get(0).text());
            lyrics.setTitle(element.getElementsByTag("song").get(0).text());
            lyrics.setURL("http://api.chartlyrics.com/apiv1.asmx/GetLyric?lyricId=" + id + "&lyricCheckSum=" + checksum);
            results.add(lyrics);
        }
        return results;
    } catch (Exception e) {
        // Any failure (network, missing child element, ...) ends the search; the stack trace is
        // only printed for non-I/O errors when BuildConfig.DEBUG is false.
        if (!BuildConfig.DEBUG && !(e instanceof IOException)) {
            e.printStackTrace();
        }
    }

    // Partial results gathered before a failure are discarded.
    return new ArrayList<>();
}
 
Example 2
Source Project: a   File: WebDavFile.java    License: GNU General Public License v3.0 6 votes vote down vote up
/**
 * Extracts the file entries from a WebDAV listing.
 * <p>
 * Every {@code d:response} element contributes its first {@code d:href}; hrefs ending in
 * {@code "/"} are skipped (presumably directories). The remaining entries are resolved
 * against this object's URL.
 *
 * @param s the listing markup to parse
 * @return the files found; entries whose URL is malformed are left out
 */
private List<WebDavFile> parseDir(String s) {
    Elements responses = Jsoup.parse(s).getElementsByTag("d:response");

    // Make sure the base ends with exactly the separator we append file names to.
    String root = getUrl();
    if (!root.endsWith("/")) {
        root = root + "/";
    }

    List<WebDavFile> files = new ArrayList<>();
    for (Element response : responses) {
        String href = response.getElementsByTag("d:href").get(0).text();
        if (href.endsWith("/")) {
            continue;
        }
        String name = href.substring(href.lastIndexOf("/") + 1);
        try {
            WebDavFile file = new WebDavFile(root + name);
            file.setDisplayName(name);
            file.setUrlName(href);
            files.add(file);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }
    return files;
}
 
Example 3
/**
 * Loads the sub-categories of this category. Only one level is loaded, i.e. the
 * sub-categories of a sub-category are not loaded.
 * When this method is called, the server is queried for the sub-categories of this
 * category and {@link #children} of this object is updated.
 * <p>
 * To load sub-categories recursively, call {@link #loadAllChild()}.
 *
 * @throws IOException if querying the server for the child nodes fails
 */
public void loadChild() throws IOException {
    // Terminal categories are left untouched.
    if (isTerminal()) {
        return;
    }

    checkCookie();
    String endpoint = NJULib.baseUrl + "/classifyview";
    String payload = "fenlei=" + this.getId() + "&lib=markbook";
    String response = MyHttpRequest.postWithCookie(payload, endpoint, null, cookie, "UTF-8", "UTF-8", 1000);

    for (Element item : Jsoup.parse(response).getElementsByTag("li")) {
        String id = item.attr("id");
        String rawName = item.getElementsByTag("a").text();
        // An img whose onClick mentions getSubTree marks an entry that has children of its own.
        boolean expandable = item.getElementsByTag("img").attr("onClick").contains("getSubTree");

        BookClass child;
        if (expandable) {
            child = new BookClass(id, MyDecoder.decodeUrlUnicode(rawName), this);
        } else {
            child = new TerminalBookClass(id, MyDecoder.decodeUrlUnicode(rawName), this);
        }
        child.setCookie(cookie);
        this.addChild(child);
    }
    this.isLoaded = true;
}
 
Example 4
Source Project: warnings-ng-plugin   File: SourcePrinterTest.java    License: MIT License 6 votes vote down vote up
/**
 * Renders {@code format.xml} for a default issue and checks that both the text of the whole
 * rendered document and the text of its {@code pre} elements equal the file content,
 * ignoring whitespace.
 */
@Test
@org.jvnet.hudson.test.Issue("JENKINS-55679")
void shouldRenderXmlFiles() {
    SourcePrinter printer = new SourcePrinter();
    Issue issue = new IssueBuilder().build();

    Document document = Jsoup.parse(
            printer.render(asStream("format.xml"), issue, NO_DESCRIPTION, ICON_URL));
    String expectedFile = toString("format.xml");

    assertThat(document.text()).isEqualToIgnoringWhitespace(expectedFile);
    assertThat(document.getElementsByTag("pre").text()).isEqualToIgnoringWhitespace(expectedFile);
}
 
Example 5
/**
 * Returns the available agents grouped by version.
 * <p>
 * When online registration is disabled the offline list is returned. Otherwise the docroot
 * URL is fetched and every link whose text starts with {@code "v"} is taken as a version
 * (trailing {@code "/"} stripped) whose agents are looked up via {@code getAgents}.
 *
 * @return a map from version to its agents
 * @throws InsightsCustomException if the docroot page cannot be fetched
 */
@Override
public Map<String, ArrayList<String>> getSystemAvailableAgentList() throws InsightsCustomException {
	if (!ApplicationConfigProvider.getInstance().getAgentDetails().isOnlineRegistration()) {
		return getOfflineSystemAvailableAgentList();
	}

	Map<String, ArrayList<String>> agentsByVersion = new TreeMap<>();
	String docrootUrl = ApplicationConfigProvider.getInstance().getAgentDetails().getDocrootUrl();
	try {
		for (Element link : Jsoup.connect(docrootUrl).get().getElementsByTag("a")) {
			String label = link.text();
			if (label == null || !label.startsWith("v")) {
				continue;
			}
			String version = StringUtils.stripEnd(label, "/");
			agentsByVersion.put(version, getAgents(version));
		}
	} catch (IOException e) {
		log.error("Error while getting system agent list ", e);
		throw new InsightsCustomException(e.toString());
	}
	return agentsByVersion;
}
 
Example 6
/**
 * Sets the given HTML as the message text and attaches every locally referenced image inline.
 * <p>
 * Each {@code img} whose {@code src} does not start with {@code "http"} gets its
 * {@code src} rewritten to {@code cid:<src>}; the original value is then used both as the
 * inline content id and as the file name below {@code templatesPath}.
 *
 * @param mailMessage the message helper to fill
 * @param htmlText    the HTML body to send
 * @return the HTML after the {@code src} rewriting
 * @throws Exception if setting the text or adding an inline resource fails
 */
private String addResourcesInMessage(final MimeMessageHelper mailMessage, final String htmlText) throws Exception {
    final Document document = Jsoup.parse(htmlText);

    // Collect the local image paths while pointing the tags at their future content ids.
    final List<String> localImages = new ArrayList<>();
    document.getElementsByTag("img").forEach(image -> {
        final String src = image.attr("src");
        if (image.hasAttr("src") && !src.startsWith("http")) {
            image.attr("src", "cid:" + src);
            localImages.add(src);
        }
    });

    final String html = document.html();
    mailMessage.setText(html, true);

    for (final String path : localImages) {
        final File imageFile = new File(templatesPath, path);
        mailMessage.addInline(path, new FileSystemResource(imageFile), getContentTypeByFileName(path));
    }

    return html;
}
 
Example 7
Source Project: jbake   File: HtmlUtil.java    License: MIT License 6 votes vote down vote up
/**
 * Rewrites the {@code src} of every image in the body of the given file contents.
 * <p>
 * Image paths are specified relative to the assets folder. The site host is prefixed to
 * every img src except those starting with http:// or https://. A path starting with
 * "./" (relative to the source file) is first replaced with the output file directory
 * before the site host is added. The per-image rewriting is delegated to
 * {@code transformImageSource}.
 *
 * @param fileContents  Map representing file contents; its body entry is replaced
 * @param configuration Configuration object supplying the site host and the prepend flag
 */
public static void fixImageSourceUrls(Map<String, Object> fileContents, JBakeConfiguration configuration) {
    String body = fileContents.get(Attributes.BODY).toString();
    boolean prependSiteHost = configuration.getImgPathPrependHost();
    String siteHost = configuration.getSiteHost();
    String uri = getDocumentUri(fileContents);

    Document fragment = Jsoup.parseBodyFragment(body);
    fragment.getElementsByTag("img")
            .forEach(img -> transformImageSource(img, uri, siteHost, prependSiteHost));

    // body().html() yields only the inner markup, so no <body></body> wrapper is stored.
    fileContents.put(Attributes.BODY, fragment.body().html());
}
 
Example 8
/**
 * Loads the sub-categories of this category. Only one level is loaded, i.e. the
 * sub-categories of a sub-category are not loaded.
 * When this method is called, the server is queried for the sub-categories of this
 * category and {@link #children} of this object is updated.
 * <p>
 * To load sub-categories recursively, call {@link #loadAllChild()}.
 *
 * @throws IOException if querying the server for the child nodes fails
 */
public void loadChild() throws IOException {
    // Nothing to load for a terminal category.
    if (isTerminal()) {
        return;
    }

    checkCookie();
    String requestUrl = NJULib.baseUrl + "/classifyview";
    String form = "fenlei=" + this.getId() + "&lib=markbook";
    String html = MyHttpRequest.postWithCookie(form, requestUrl, null, cookie, "UTF-8", "UTF-8", 1000);

    Elements entries = Jsoup.parse(html).getElementsByTag("li");
    for (Element entry : entries) {
        String id = entry.attr("id");
        String encodedName = entry.getElementsByTag("a").text();
        // Entries whose img has a getSubTree onClick handler can be expanded further.
        String onClick = entry.getElementsByTag("img").attr("onClick");
        BookClass child = onClick.contains("getSubTree")
                ? new BookClass(id, MyDecoder.decodeUrlUnicode(encodedName), this)
                : new TerminalBookClass(id, MyDecoder.decodeUrlUnicode(encodedName), this);
        child.setCookie(cookie);
        this.addChild(child);
    }
    this.isLoaded = true;
}
 
Example 9
/**
 * Registers the Ruby {@code GistBlockMacro} under the block name {@code mygist} and checks
 * that converting a document using that macro produces exactly one {@code script} element
 * pointing at the gist.
 */
@Test
public void ruby_block_macro_processor_should_be_registered_with_block_name() {

    RubyExtensionRegistry rubyExtensionRegistry = asciidoctor.rubyExtensionRegistry();
    rubyExtensionRegistry.loadClass(getClass().getResourceAsStream("/ruby-extensions/gist-block-macro.rb")).blockMacro("mygist", "GistBlockMacro");

    String content = asciidoctor.convert(
            ".My Gist\n" +
                "mygist::123456[]",
            options().toFile(false).get());

    // The second argument of Jsoup.parse(String, String) is a base URI, not a charset;
    // the content is already a String, so no charset is involved here.
    Document doc = Jsoup.parse(content);
    Elements elements = doc.getElementsByTag("script");
    assertThat(elements.size(), is(1));
    assertThat(elements.get(0).attr("src"), is("https://gist.github.com/123456.js"));

}
 
Example 10
Source Project: MoeQuest   File: MeiziUtil.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Parses the "autodyne" (selfie) page HTML into picture entries.
 * <p>
 * At most the first 15 {@code p} elements are inspected; the first {@code img} of each
 * supplies the image URL ({@code src}) and title ({@code alt}). Paragraphs without an
 * image are skipped, and pages with fewer than 15 paragraphs are handled without error.
 *
 * @param html the page markup
 * @param type the type stored on every created entry
 * @return the parsed entries; each entry's order is the index of its paragraph
 */
public List<MeiziTu> parserMeiziTuByAutodyne(String html, String type) {

  List<MeiziTu> list = new ArrayList<>();
  Elements paragraphs = Jsoup.parse(html).getElementsByTag("p");

  // Never read past the paragraphs that are actually present.
  int limit = Math.min(15, paragraphs.size());
  for (int i = 0; i < limit; i++) {
    Element img = paragraphs.get(i).select("img").first();
    if (img == null) {
      // Paragraph carries no picture.
      continue;
    }
    MeiziTu meiziTu = new MeiziTu();
    meiziTu.setOrder(i);
    meiziTu.setType(type);
    meiziTu.setWidth(0);
    meiziTu.setHeight(0);
    meiziTu.setImageurl(img.attr("src"));
    meiziTu.setTitle(img.attr("alt"));
    list.add(meiziTu);
  }
  return list;
}
 
Example 11
/**
 * Collects the images of the given HTML whose {@code src} does not start with
 * {@code "http"}.
 *
 * @param sendText the HTML text to inspect
 * @return a map from a freshly generated key to the original {@code src} value
 */
private Map<String, String> extractSendTextImage(String sendText) {
    Map<String, String> images = new HashMap<>();
    for (Element image : Jsoup.parse(sendText).getElementsByTag("img")) {
        String original = image.attr("src");
        if (!original.startsWith("http")) {
            String contentId = IDGenerator.MD5.generate();
            // Only the locally parsed document is modified; it is not returned.
            image.attr("src", "cid:".concat(contentId));
            images.put(contentId, original);
        }
    }
    return images;
}
 
Example 12
Source Project: ripme   File: VscoRipper.java    License: MIT License 5 votes vote down vote up
/**
 * Fetches the given page and returns the image URL from its {@code og:image} meta tag.
 *
 * @param url the page to load
 * @return the {@code content} of the first meta tag whose {@code property} is
 *         {@code og:image}, with any {@code ?h=<digits>} part removed, or an empty string
 *         if there is no such tag
 * @throws IOException if the page cannot be fetched
 */
private String vscoImageToURL(String url) throws IOException{
    Document page = Jsoup.connect(url).userAgent(USER_AGENT).get();

    String result = "";
    for (Element metaTag : page.getElementsByTag("meta")) {
        if (!metaTag.attr("property").equals("og:image")) {
            continue;
        }
        // Drop the "?h=xxx" part of the URL (where each x is a digit).
        result = metaTag.attr("content").replaceAll("\\?h=[0-9]+", "");
        LOGGER.debug("Found image URL: " + result);
        // Only the first match is used (there should only be 1 image to be downloaded).
        break;
    }

    // An empty result means the website changed and this needs to be fixed.
    if (result.isEmpty()){
        LOGGER.error("Could not find image URL at: " + url);
    }

    return result;
}
 
Example 13
Source Project: a   File: EncodingDetect.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Tries to read the declared character encoding out of an HTML (or XML) document.
 * <p>
 * The bytes are decoded as UTF-8 and parsed with Jsoup. The lookup order is:
 * <ol>
 *   <li>the word {@code encoding} in the string form of the first child node
 *       (presumably an XML declaration) - the value between the next pair of double
 *       quotes is returned;</li>
 *   <li>the {@code charset} attribute of a {@code meta} tag;</li>
 *   <li>the {@code content} of a {@code meta http-equiv="content-type"} tag.</li>
 * </ol>
 * If none of these yields a value, or any exception occurs on the way, the result of
 * {@code getJavaEncode(bytes)} is returned.
 *
 * @param bytes the raw document bytes
 * @return the encoding name found in the markup, or the {@code getJavaEncode} fallback
 */
public static String getEncodeInHtml(@NonNull byte[] bytes) {
    try {
        String charsetStr = "UTF-8";
        Document doc = Jsoup.parse(new String(bytes, charsetStr));
        // Step 1: look for an "encoding" keyword in the first node of the document.
        int a = doc.childNode(0).toString().indexOf("encoding");
        if (a > 0) {
            String e = doc.childNode(0).toString().substring(a);
            // The value is whatever sits between the first two double quotes after the keyword.
            int b = e.indexOf('"');
            int c = e.indexOf('"', b + 1);
            return e.substring(b + 1, c);
        }
        // Steps 2 and 3: inspect every meta tag in document order.
        Elements metaTags = doc.getElementsByTag("meta");
        for (Element metaTag : metaTags) {
            String content = metaTag.attr("content");
            String http_equiv = metaTag.attr("http-equiv");
            // charsetStr is reused as scratch space from here on.
            charsetStr = metaTag.attr("charset");
            if (!charsetStr.isEmpty()) {
                if (!isEmpty(charsetStr)) {
                    return charsetStr;
                }
            }
            if (http_equiv.toLowerCase().equals("content-type")) {
                if (content.toLowerCase().contains("charset")) {
                    // Everything after "charset=" is taken as the encoding name.
                    charsetStr = content.substring(content.toLowerCase().indexOf("charset") + "charset=".length());
                } else {
                    // No charset keyword: take everything after the first ';' (the whole
                    // content if there is none, since indexOf then yields -1).
                    charsetStr = content.substring(content.toLowerCase().indexOf(";") + 1);
                }
                if (!isEmpty(charsetStr)) {
                    return charsetStr;
                }
            }
        }
    } catch (Exception ignored) {
        // Any failure above (e.g. no child node, missing quotes) falls through to the fallback.
    }
    return getJavaEncode(bytes);
}
 
Example 14
Source Project: snowflake-jdbc   File: SessionUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the post back url out of the HTML returned by the IDP: the {@code action}
 * attribute of the first {@code form} inside the first {@code body} element.
 *
 * @param html The HTML that we are parsing to find the post back url
 * @return The post back url
 */
static private String getPostBackUrlFromHTML(String html)
{
  return Jsoup.parse(html)
      .getElementsByTag("body").get(0)
      .getElementsByTag("form").first()
      .attr("action");
}
 
Example 15
/**
 * Converts {@code renderwithfrontmatter.adoc} with the skip-front-matter attribute enabled
 * and checks that the output contains no {@code hr} element.
 */
@Test
public void should_skip_front_matter_if_specified_by_skip_front_matter_attribute()
        throws IOException {

    Attributes attributes = attributes().skipFrontMatter(true).get();
    Options options = options().toFile(false).inPlace(false).attributes(attributes).get();

    String content = asciidoctor.convertFile(classpath.getResource("renderwithfrontmatter.adoc"), options);
    // The second argument of Jsoup.parse(String, String) is a base URI, not a charset;
    // the content is already a String, so no charset is involved here.
    Document doc = Jsoup.parse(content);
    Elements hrElements = doc.getElementsByTag("hr");

    assertThat(hrElements.size(), is(0));

}
 
Example 16
Source Project: zap-extensions   File: VulnChecker.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Gives a list of securiteam.com search result links for the given application and version.
 * <p>
 * Both words are URL-encoded before being placed in the query string. For every
 * {@code dl} element of the result page the {@code href} of its first anchor is printed
 * to standard output and collected; {@code dl} elements without an anchor are skipped.
 *
 * @param appName name of the application to search for
 * @param version version of the application to search for
 * @return the links found, each followed by a newline; empty if the page reports no matches
 * @throws Exception if the URL cannot be built or the page cannot be loaded
 */
public static ArrayList<String> fromSecuritiTeam(String appName, String version)
        throws Exception {
    ArrayList<String> results = new ArrayList<String>();
    // Encode each word separately so that spaces or reserved characters cannot break the
    // query string; the '+' between them stays the word separator. String.valueOf keeps the
    // previous behaviour of rendering a null argument as "null".
    URL url =
            new URL(
                    "http://www.securiteam.com/cgi-bin/htsearch?words="
                            + java.net.URLEncoder.encode(String.valueOf(appName), "UTF-8")
                            + "+"
                            + java.net.URLEncoder.encode(String.valueOf(version), "UTF-8"));

    WebPage wp = new WebPage(url);
    Document doc = wp.getDocument();
    if (doc.outerHtml().contains("No matches were found for")) {
        System.out.println("No Results Found");
        return results;
    }

    for (Element elt : doc.getElementsByTag("dl")) {
        Element anchor = elt.getElementsByTag("a").first();
        if (anchor == null) {
            // Result block without a link: nothing to report.
            continue;
        }
        // for the moment only the links are returned
        String link = anchor.attr("href");
        System.out.println(link);
        results.add(link + "\n");
    }
    return results;
}
 
Example 17
Source Project: templatespider   File: ResourceQuote.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Replaces the {@code src} of the img tags in the given document.
 * <p>
 * Each {@code src} is passed through {@code hierarchyReplace} together with this object's
 * base URI; the attribute is only written when the result differs from the current value.
 *
 * @param doc the document to update in place
 * @return the same document instance
 */
public Document imgTag(Document doc){
	for (Element image : doc.getElementsByTag("img")) {
		String current = image.attr("src");
		String replaced = hierarchyReplace(this.baseUri, current);
		if (current.equals(replaced)) {
			continue;
		}
		image.attr("src", replaced);
	}
	return doc;
}
 
Example 18
Source Project: WordPressHelper   File: FeedParser.java    License: MIT License 4 votes vote down vote up
/**
 * Downloads the feed at {@code FEED_URL} and converts every {@code item} element into a
 * {@code FeedItem}. Items whose guid contains {@code "p="} get the text after it as id and
 * are appended to {@code feedItems}; other items are dropped.
 *
 * @param params not used
 * @return always {@code null}
 */
@Override
protected Object doInBackground(Object[] params) {
    try {
        Document document = Jsoup.connect(FEED_URL)
                .userAgent("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22")
                .timeout(60000).ignoreContentType(true).get();
        Elements elements = document.getElementsByTag("item");
        for (Element element : elements) {
            FeedItem feedItem = new FeedItem();

            //get all simple information
            feedItem.setTitle(element.getElementsByTag("title").first().text());
            feedItem.setPubDate(element.getElementsByTag("pubDate").first().text());
            feedItem.setCreator(element.getElementsByTag("dc:creator").first().text());
            feedItem.setDescription(element.getElementsByTag("description").first().text());
            // Looked up once and reused below for the image extraction.
            String content = element.getElementsByTag("content:encoded").first().text();
            feedItem.setContent(content);
            feedItem.setCommentRss(element.getElementsByTag("wfw:commentRss").first().text());
            feedItem.setComments(element.getElementsByTag("slash:comments").first().text());
            // The link value is read from the node following the link element rather than
            // from its content (presumably because the HTML parser treats <link> as empty).
            feedItem.setLink(element.select("link").first().nextSibling().toString().trim());
            // Looked up once and reused below for the id extraction.
            String guid = element.getElementsByTag("guid").first().text();
            feedItem.setGuid(guid);

            //get first image
            feedItem.setImage(Jsoup.parse(content).select("img").attr("src"));

            //get all category
            // Iterate the matched elements directly instead of re-querying them per index.
            ArrayList<String> category = new ArrayList<>();
            for (Element categoryElement : element.getElementsByTag("category")) {
                category.add(categoryElement.text());
            }
            feedItem.setCategory(category);

            //get id
            String[] idPost = guid.split("p=");
            if (idPost.length > 1) {
                feedItem.setId(idPost[1]);
                //add feeditem to arraylist
                feedItems.add(feedItem);
            }

        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
 
Example 19
Source Project: mamute   File: CustomVRaptorIntegration.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Parses the given HTML and returns all elements with the given tag name.
 *
 * @param html    the markup to parse
 * @param tagName the tag name to look for
 * @return the matching elements of the parsed document
 */
protected Elements getElementsByTag(String html, String tagName) {
	return Jsoup.parse(html).getElementsByTag(tagName);
}
 
Example 20
Source Project: ripme   File: ThechiveRipper.java    License: MIT License 4 votes vote down vote up
/**
 * Extracts the image and gif URLs of a gallery page.
 * <p>
 * The image urls are stored in a {@code <script>} tag of the document. This script
 * contains a single array var by name CHIVE_GALLERY_ITEMS. All {@code <img>} tags found
 * in such a script are combined in a string, parsed, and their img/gif urls collected.
 * Finally any GET parameters are removed from the collected links.
 *
 * @param doc the gallery page
 * @return the URLs found, without query string; empty if no matching script exists
 */
private List<String> getUrlsFromThechive(Document doc) {
    List<String> result = new ArrayList<>();
    Elements scripts = doc.getElementsByTag("script");

    for (Element script : scripts) {
        String data = script.data();

        if (!data.contains("CHIVE_GALLERY_ITEMS")) {
            continue;
        }

        /*
         * We add all the <img/> tags in a single StringBuilder and parse as HTML for
         * easy sorting of img/ gifs.
         */
        StringBuilder allImgTags = new StringBuilder();
        Matcher matcher = imagePattern.matcher(data);
        while (matcher.find()) {
            // Unescape '\' from the img tags, which also unescapes the img url as well.
            allImgTags.append(matcher.group(0).replaceAll("\\\\", ""));
        }

        // Now we parse and sort links.
        Document imgDoc = Jsoup.parse(allImgTags.toString());
        Elements imgs = imgDoc.getElementsByTag("img");
        for (Element img : imgs) {
            if (img.hasAttr("data-gifsrc")) {
                // For gifs.
                result.add(img.attr("data-gifsrc"));
            } else {
                // For jpeg images.
                result.add(img.attr("src"));
            }
        }
    }

    // strip all GET parameters from the links (such as quality, width, height) so as to
    // get the original image. Links without a '?' are kept unchanged; an unguarded
    // substring would throw for them because indexOf returns -1.
    result.replaceAll(s -> {
        int queryStart = s.indexOf("?");
        return queryStart >= 0 ? s.substring(0, queryStart) : s;
    });

    return result;
}