org.jsoup.Jsoup Java Examples

The following examples show how to use org.jsoup.Jsoup. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParseTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Loads a saved copy of the news.com.au homepage and checks its title, one
 * headline, and the values reported for "href" versus "abs:href" on two links.
 */
@Test
public void testNewsHomepage() throws IOException {
    File homepage = getFile("/htmltests/news-com-au-home.html");
    Document page = Jsoup.parse(homepage, "UTF-8", "http://www.news.com.au/");

    assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", page.title());
    String headline = page.select(".id1225817868581 h4").text().trim();
    assertEquals("Brace yourself for Metro meltdown", headline);

    // Relative link: "href" is the raw attribute, "abs:href" carries the base URI prefix.
    Element horoscopes = page.select("a[href=/entertainment/horoscopes]").first();
    assertEquals("/entertainment/horoscopes", horoscopes.attr("href"));
    assertEquals("http://www.news.com.au/entertainment/horoscopes", horoscopes.attr("abs:href"));

    // Link whose href is already absolute: both forms are expected to be identical.
    Element heraldSun = page.select("a[href*=naughty-corners-are-a-bad-idea]").first();
    String rawHref = heraldSun.attr("href");
    assertEquals(
            "http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003",
            rawHref);
    assertEquals(rawHref, heraldSun.attr("abs:href"));
}
 
Example #2
Source File: ElementTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Removes every sibling element positioned before the "XXX" paragraph and
 * checks that only that paragraph and the ones after it remain in the body.
 */
@Test
public void testRemoveBeforeIndex() {
    String html =
            "<html><body><div><p>before1</p><p>before2</p><p>XXX</p><p>after1</p><p>after2</p></div></body></html>";
    Document doc = Jsoup.parse(html, "");
    Element body = doc.select("body").first();

    Element marker = body.select("p:matchesOwn(XXX)").first();
    Elements preceding = marker.parent().getElementsByIndexLessThan(marker.elementSiblingIndex());

    // Detach each element that sits before the marker paragraph.
    for (int i = 0; i < preceding.size(); i++) {
        preceding.get(i).remove();
    }

    String remaining = TextUtil.stripNewlines(body.outerHtml());
    assertEquals("<body><div><p>XXX</p><p>after1</p><p>after2</p></div></body>", remaining);
}
 
Example #3
Source File: ElementTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Checks that a cloned element starts without a parent, can be modified
 * without changing the source document, and can then be appended to it.
 */
@Test public void testClone() {
    Document doc = Jsoup.parse("<div><p>One<p><span>Two</div>");

    Element original = doc.select("p").get(1);
    Element copy = original.clone();

    // The copy should be orphaned; the original keeps its parent and index.
    assertNull(copy.parent());
    assertEquals(0, copy.siblingIndex);
    assertEquals(1, original.siblingIndex);
    assertNotNull(original.parent());

    copy.append("<span>Three");
    String copyHtml = TextUtil.stripNewlines(copy.outerHtml());
    assertEquals("<p><span>Two</span><span>Three</span></p>", copyHtml);
    // The source document is expected to be unchanged by the append on the copy.
    String untouchedBody = TextUtil.stripNewlines(doc.body().html());
    assertEquals("<div><p>One</p><p><span>Two</span></p></div>", untouchedBody);

    // Adopt the copy into the document body.
    doc.body().appendChild(copy);
    assertNotNull(copy.parent());
    String adoptedBody = TextUtil.stripNewlines(doc.body().html());
    assertEquals("<div><p>One</p><p><span>Two</span></p></div><p><span>Two</span><span>Three</span></p>", adoptedBody);
}
 
Example #4
Source File: CleanerTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Runs a set of body-HTML samples through Jsoup.isValid and checks which ones
 * are accepted or rejected under the basic and none whitelists.
 */
@Test public void testIsValidBodyHtml() {
    String[] accepted = {
        "<p>Test <b><a href='http://example.com/' rel='nofollow'>OK</a></b></p>",
        // missing enforced attribute is OK because it still needs to be run through the cleaner
        "<p>Test <b><a href='http://example.com/'>OK</a></b></p>",
    };
    String[] rejected = {
        "<p><script></script>Not <b>OK</b></p>",
        "<p align=right>Test Not <b>OK</b></p>",
        // comments and the like will be cleaned
        "<!-- comment --><p>Not OK</p>",
        // not body html
        "<html><head>Foo</head><body><b>OK</b></body></html>",
        "<p>Test <b><a href='http://example.com/' rel='nofollowme'>OK</a></b></p>",
        // missing close tag
        "<p>Test <b><a href='http://example.com/'>OK</b></p>",
    };

    for (String html : accepted) {
        assertTrue(Jsoup.isValid(html, Whitelist.basic()));
    }
    for (String html : rejected) {
        assertFalse(Jsoup.isValid(html, Whitelist.basic()));
    }
    // The first accepted sample is rejected once nothing is whitelisted.
    assertFalse(Jsoup.isValid(accepted[0], Whitelist.none()));
    assertFalse(Jsoup.isValid("</div>What", Whitelist.basic()));
}
 
Example #5
Source File: AlbumRequest.java    From meizhi with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the response body with the charset taken from the response headers,
 * parses it as HTML, and collects one Image per matched img element that has a
 * non-empty src attribute.
 *
 * @param response raw network response to decode and parse
 * @return a success response holding the collected images, or an error
 *         response wrapping a ParseError when the charset is unsupported
 */
@Override
protected Response<List<Image>> parseNetworkResponse(NetworkResponse response) {
    try {
        String charset = HttpHeaderParser.parseCharset(response.headers);
        Document page = Jsoup.parse(new String(response.data, charset));

        List<Image> found = new ArrayList<>();
        for (Element tag : page.select(".container.main .box.show-box img")) {
            String src = tag.attr("src");
            if (!TextUtils.isEmpty(src)) {
                Image entry = new Image();
                entry.url = src;
                found.add(entry);
            }
        }

        return Response.success(found, HttpHeaderParser.parseCacheHeaders(response));
    } catch (UnsupportedEncodingException e) {
        return Response.error(new ParseError(e));
    }
}
 
Example #6
Source File: SteamWebHandler.java    From UpdogFarmer with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Unlock Steam parental controls with a pin.
 *
 * @param pin pin sent as the "pin" form field of the POST request
 * @return value of the "steamparental" response cookie, or null when that
 *         cookie is absent or the request throws
 */
private String unlockParental(String pin) {
    final String url = STEAM_STORE + "parental/ajaxunlock";
    String parentalCookie = null;
    try {
        final Connection request = Jsoup.connect(url)
                .referrer(STEAM_STORE)
                .followRedirects(true)
                .ignoreContentType(true)
                .cookies(generateWebCookies())
                .data("pin", pin)
                .method(Connection.Method.POST);
        final Map<String,String> responseCookies = request.execute().cookies();
        parentalCookie = responseCookies.get("steamparental");
    } catch (Exception e) {
        // Any failure is printed and reported to the caller as null.
        e.printStackTrace();
    }
    return parentalCookie;
}
 
Example #7
Source File: SourcePrinterTest.java    From warnings-ng-plugin with MIT License 6 votes vote down vote up
/**
 * Renders "format-java.txt" for an issue built with default builder settings
 * and checks that both the whole document text and the text of its pre
 * elements equal the file content, ignoring whitespace.
 */
@Test
void shouldCreateSourceWithoutLineNumber() {
    final String fixture = "format-java.txt";
    SourcePrinter printer = new SourcePrinter();
    Issue issue = new IssueBuilder().build();

    Document document = Jsoup.parse(
            printer.render(asStream(fixture), issue, NO_DESCRIPTION, ICON_URL));
    String expectedFile = toString(fixture);

    assertThat(document.text()).isEqualToIgnoringWhitespace(expectedFile);

    Elements preBlocks = document.getElementsByTag("pre");
    assertThat(preBlocks.text()).isEqualToIgnoringWhitespace(expectedFile);
}
 
Example #8
Source File: Book.java    From nju-lib-downloader with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Requests the directory tree for this book, parses the HTML found in the
 * "data" field of the JSON reply, and converts its directoryTree list through
 * parseUL.
 * <p>
 * Up to 20 attempts are made; failures before the last attempt are ignored and
 * the failure of the last attempt is rethrown.
 *
 * @return the nodes produced by parseUL for the first matching ul element
 * @throws IOException when the final attempt fails with an I/O error
 */
public List<Node> getOutline() throws IOException {
    final int maxAttempts = 20;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            // e.g. http://sxnju.chineseall.cn/book/getDirectoryTree.jsps?bookId=10060602592&type=PDF&_=1504844448871
            String url = CoreService.baseUrl + "/book/getDirectoryTree.jsps?bookId=" + idInt + "&type=PDF";
            String json = MyHttpRequest.get(url, null, "UTF-8", 3000);

            String treeHtml = new ObjectMapper().readValue(json, ObjectNode.class).get("data").textValue();

            Elements trees = Jsoup.parse(treeHtml).select("ul[id=directoryTree]");
            return parseUL(trees.get(0));
        } catch (Exception e) {
            // Only the last attempt's failure is propagated.
            if (attempt == maxAttempts) {
                throw e;
            }
        }
    }
    return null;
}
 
Example #9
Source File: IPUtils.java    From superword with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up an IP address on ip138.com and collects the text after the colon
 * of every non-blank "ul li" entry that splits into exactly two parts on ":".
 *
 * @param ip address appended to the lookup URL
 * @return the collected values; empty when nothing matched or the lookup
 *         failed (failures are logged, not thrown)
 */
public static List<String> getIPLocation(String ip){
    List<String> locations = new ArrayList<>();
    try {
        URL lookup = new URL("http://ip138.com/ips138.asp?ip=" + ip);
        for (Element item : Jsoup.parse(lookup, 60000).select("ul li")) {
            String text = item.text();
            if (!StringUtils.isNotBlank(text)) {
                continue;
            }
            String[] parts = text.split(":");
            if (parts.length == 2) {
                locations.add(parts[1]);
            }
        }
    } catch (Exception e) {
        LOG.error("获取IP地址的地理位置", e);
    }
    return locations;
}
 
Example #10
Source File: TextFilterManage.java    From bbs with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Reads the path names of uploaded images referenced by an HTML fragment.
 * @param html HTML fragment to scan; blank input yields an empty list
 * @param item item name; only src values that start with "file/" + item + "/"
 *             (ignoring case) are kept
 * @return the matching src attribute values, in the order the selector returns them
 */
public List<String> readImageName(String html,String item) {
	// Names of the uploaded image files.
	List<String> imageNameList = new ArrayList<String>();
	if(StringUtils.isBlank(html)){
		return imageNameList;
	}

	String prefix = "file/"+item+"/";
	// Images that carry a src attribute.
	for (Element img : Jsoup.parseBodyFragment(html).select("img[src]")) {
		String src = img.attr("src");
		if(StringUtils.startsWithIgnoreCase(src, prefix)){
			imageNameList.add(src);
		}
	}
	return imageNameList;
}
 
Example #11
Source File: ElementTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Moves elements from one document into another with appendTo and checks the
 * return values and the resulting HTML of both documents.
 */
@Test
public void testAppendTo() {
    Document parentDoc = Jsoup.parse("<div class='a'></div>");
    Document childDoc = Jsoup.parse("<div class='b'></div><p>Two</p>");
    Element target = parentDoc.body();

    Element movedDiv = childDoc.select("div").first();
    Element movedP = childDoc.select("p").first();

    // appendTo is expected to return the element it was called on.
    assertEquals(movedDiv, movedDiv.appendTo(target));
    assertEquals(movedP, movedP.appendTo(movedDiv));

    assertEquals("<div class=\"a\"></div>\n<div class=\"b\">\n <p>Two</p>\n</div>", parentDoc.body().html());
    // Both elements were moved out of the child document.
    assertEquals("", childDoc.body().html());
}
 
Example #12
Source File: WordClassifierForOxford.java    From superword with Apache License 2.0 6 votes vote down vote up
/**
 * Posts a request for the given word to the OXFORD URL and returns the
 * response HTML with all CR and LF characters removed.
 * <p>
 * A six-digit random "renovate" query parameter is appended to the URL.
 *
 * @param word word appended to the OXFORD base URL
 * @return the response HTML on one line, or "" when the request fails
 */
public static String getContent(String word) {
    int renovate = new Random(System.currentTimeMillis()).nextInt(899999)+100000;
    String url = OXFORD + word + "?renovate=" + renovate;
    LOGGER.debug("url:"+url);

    Connection conn = Jsoup.connect(url)
            .header("Accept", ACCEPT)
            .header("Accept-Encoding", ENCODING)
            .header("Accept-Language", LANGUAGE)
            .header("Connection", CONNECTION)
            .header("Referer", REFERER)
            .header("Host", HOST)
            .header("User-Agent", USER_AGENT)
            .timeout(60000)
            .ignoreContentType(true);

    try {
        return conn.post().html().replaceAll("[\n\r]", "");
    } catch (Exception e) {
        // Only the URL is reported; the exception itself is not passed to the logger.
        LOGGER.error("获取URL:"+url+"页面出错");
        return "";
    }
}
 
Example #13
Source File: NewService.java    From Pixiv-Illustration-Collection-Backend with Apache License 2.0 6 votes vote down vote up
/**
 * Posts a listing request to the ACGMH news category page, converts every
 * matching post element of the returned HTML into an ACGNew, and passes the
 * resulting list to process.
 *
 * @throws IOException if sending the request fails
 * @throws InterruptedException if sending the request is interrupted
 */
private void pullACGMHNews() throws IOException, InterruptedException {
    HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create("https://www.acgmh.com/category/news"))
            .POST(HttpRequest.BodyPublishers.ofString("type=catL3&paged=1"))
            .build();
    String body = httpClient.send(request, HttpResponse.BodyHandlers.ofString()).body();

    Elements posts = Jsoup.parse(body).getElementsByClass("pos-r pd10 post-list box mar10-b content");
    List<ACGNew> acgNewList = posts.stream().map(post -> {
        // The cover URL is cut out of the style value: it starts after "('"
        // and ends two characters before the end of the string.
        String style = post.getElementsByClass("preview thumb-in").get(0).attr("style");
        int coverStart = style.indexOf("('") + 2;
        String cover = style.substring(coverStart, style.length() - 2);

        String author = post.getElementsByClass("users").text();
        String createDate = post.getElementsByClass("timeago").text();

        Elements titleNodes = post.getElementsByClass("entry-title");
        String title = titleNodes.text();
        String refererUrl = titleNodes.get(0).getElementsByTag("a").get(0).attr("href");

        String intro = post.getElementsByClass("mar10-b post-ex mar10-t mobile-hide").text();
        // Only the first ten characters of the timestamp text are parsed as the date.
        LocalDate published = LocalDate.parse(createDate.substring(0, 10));
        return new ACGNew(title, intro, author, cover, refererUrl, published, NewsCrawlerConstant.ACGMH);
    }).collect(Collectors.toList());

    process(acgNewList, "id", "content-innerText");
}
 
Example #14
Source File: CDTClassifierEvaluation.java    From NLIWOD with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Loads the QALD6 multilingual log page for a system and returns the own-text
 * of the second cell of each row of the page's sixth table, minus the first
 * (header) row.
 *
 * @param system system name inserted into the log file name
 *               ("multilingual_" + system + ".html")
 * @return the collected cell texts; whatever was collected so far (possibly
 *         nothing) when reading the file fails with an IOException
 */
public static ArrayList<String> loadSystemR(String system){
	Path datapath = Paths.get("./src/main/resources/QALD6MultilingualLogs/multilingual_" + system + ".html");
	ArrayList<String> result = Lists.newArrayList();

	try{
		// Files.readAllLines closes the file before returning, whereas the
		// stream from Files.lines keeps it open until explicitly closed. Read
		// errors also surface here as IOException and reach the catch below.
		String loadedData = String.join("", Files.readAllLines(datapath));
		Document doc = Jsoup.parse(loadedData);
		Element table = doc.select("table").get(5);
		Elements tableRows = table.select("tr");
		for(Element row: tableRows){
			Elements tableEntry = row.select("td");
			result.add(tableEntry.get(1).ownText());
		}
		result.remove(0); //remove the head of the table
		return result;
	}catch(IOException e){
		e.printStackTrace();
		log.debug("loading failed.");
		return result;
	}
}
 
Example #15
Source File: FuckBroDomain.java    From TrackRay with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Fetches the aizhan ICP page for a domain and, when the page contains the
 * expected marker texts, maps the third cell of each company-table row to the
 * second cell of the same row.
 *
 * @param domain domain inserted into the lookup URL
 * @return the collected entries; empty when the markers are missing or an
 *         exception occurred (exceptions are added to the task's exception list)
 */
public Map<String,String> aizhanIcp(String domain){
    HashMap<String, String> map = new HashMap<>();
    HttpClient httpClient = new HttpClient();
    String url = "https://icp.aizhan.com/%s/";
    try {
        String html = httpClient.get(String.format(url, domain)).getContent();
        boolean hasRecords = !html.contains("未找到")
                && html.contains("该单位备案网站")
                && html.contains("缓存于");
        if (hasRecords) {
            Elements rows = Jsoup.parse(html).select("div#company .table-s1 tbody tr");
            for (Element row : rows) {
                Elements cells = row.select("td");
                String title = cells.get(1).text();
                String dom = cells.get(2).text();
                map.put(dom, title);
            }
        }
    } catch (Exception e) {
        task.getExceptions().add(e);
    }
    SysLog.info("ICP反查结束");
    return map;
}
 
Example #16
Source File: Class.java    From nju-lib-downloader with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Reads the integer held in the {@code value} attribute of the page's
 * {@code input[id=totalSize]} element.
 *
 * @param html page markup to parse
 * @return the parsed integer, or -1 when the input is absent or its value is
 *         missing, blank, or not a valid integer
 */
public static int getBookSizeFromHtml(String html){
    Elements sizeNode = Jsoup.parse(html).select("input[id=totalSize]");
    if (sizeNode == null || sizeNode.isEmpty()) {
        return -1;
    }
    // Elements.attr yields an empty string (not null) when the attribute is
    // absent, so the value has to be validated rather than null-checked.
    String sizeString = sizeNode.attr("value");
    if (sizeString == null) {
        return -1;
    }
    try {
        return Integer.parseInt(sizeString.trim());
    } catch (NumberFormatException e) {
        // Missing or malformed value: report "unknown" like the not-found case.
        return -1;
    }
}
 
Example #17
Source File: ElementTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Compares children() with childNodes() on a div, a p, a span, an unknown
 * tag and an img parsed from one HTML snippet.
 */
@Test public void testChildrenElements() {
    Document doc = Jsoup.parse(
            "<div><p><a>One</a></p><p><a>Two</a></p>Three</div><span>Four</span><foo></foo><img>");
    Element div = doc.select("div").first();
    Element p = doc.select("p").first();
    Element span = doc.select("span").first();
    Element foo = doc.select("foo").first();
    Element img = doc.select("img").first();

    // div: two element children, three child nodes (the last is the "Three" text).
    Elements divChildren = div.children();
    assertEquals(2, divChildren.size());
    assertEquals("<p><a>One</a></p>", divChildren.get(0).outerHtml());
    assertEquals("<p><a>Two</a></p>", divChildren.get(1).outerHtml());
    assertEquals(3, div.childNodes().size());
    assertEquals("Three", div.childNodes().get(2).outerHtml());

    // p: a single element child.
    Elements pChildren = p.children();
    assertEquals(1, pChildren.size());
    assertEquals("One", pChildren.text());

    // span: no element children, one child node.
    assertEquals(0, span.children().size());
    assertEquals(1, span.childNodes().size());
    assertEquals("Four", span.childNodes().get(0).outerHtml());

    // foo and img: no children of either kind.
    assertEquals(0, foo.children().size());
    assertEquals(0, foo.childNodes().size());
    assertEquals(0, img.children().size());
    assertEquals(0, img.childNodes().size());
}
 
Example #18
Source File: StatusReportGenerationErrorTest.java    From kubernetes-elastic-agents with Apache License 2.0 5 votes vote down vote up
/**
 * Handles a "no running pod" exception and checks the response code plus the
 * header and paragraph text of the error block in the returned view.
 */
@Test
public void shouldGenerateErrorViewForException() {
    final StatusReportGenerationException noRunningPod = StatusReportGenerationException.noRunningPod("foo");

    final GoPluginApiResponse response =
            StatusReportGenerationErrorHandler.handle(PluginStatusReportViewBuilder.instance(), noRunningPod);

    assertThat(response.responseCode(), is(200));

    // The HTML view is carried in the "view" field of the JSON response body.
    final String view = new JsonParser()
            .parse(response.responseBody())
            .getAsJsonObject()
            .get("view")
            .getAsString();
    final Document document = Jsoup.parse(view);

    final String heading = document.select(".outer-container .container .error-container blockquote header").text();
    assertThat(heading, is("Pod is not running."));
    final String detail = document.select(".outer-container .container .error-container blockquote p").text();
    assertThat(detail, is("Can not find a running pod for the provided elastic agent id 'foo'."));
}
 
Example #19
Source File: BaseElementSelector.java    From zongtui-webcrawler with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses the given text with Jsoup and passes the parsed result to the
 * matching select overload.
 *
 * @param text HTML text to parse; may be null
 * @return the overload's result, or null when text is null
 */
@Override
public String select(String text) {
    return text == null ? null : select(Jsoup.parse(text));
}
 
Example #20
Source File: ElementsTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Empties every selected paragraph and checks that the paragraph tags remain
 * while their content is gone.
 */
@Test public void empty() {
    Document doc = Jsoup.parse("<div><p>Hello <b>there</b></p> <p>now!</p></div>");
    // Pretty-printing is switched off before the body HTML is compared.
    doc.outputSettings().prettyPrint(false);

    doc.select("p").empty();

    String bodyHtml = doc.body().html();
    assertEquals("<div><p></p> <p></p></div>", bodyHtml);
}
 
Example #21
Source File: ParagraphMarkedClassificationTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the manipulator over a paragraph that starts with a "(UK OFFICIAL)"
 * marking and checks that the marking ends up in the paragraph's
 * "classification" attribute and is no longer part of the body text.
 */
@Test
public void testMarking() {
  Document doc = Jsoup.parseBodyFragment("<p>(UK OFFICIAL)This is some text</p>");
  m.manipulate(doc);

  // assertEquals takes (expected, actual); keeping that order makes a failure
  // message report the right value as "expected".
  // NOTE(review): assumes JUnit's assertEquals is the one imported - confirm.
  assertEquals(
      "UK OFFICIAL", MarkupUtils.getAttribute(doc.body().select("p").first(), "classification"));
  assertEquals("This is some text", doc.body().text());
}
 
Example #22
Source File: PreviewTextUtils.java    From mblog with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Gets the img URLs found in an article.
 * @param html article markup; null yields an empty list
 * @return the src attribute value of every img element, in selector order
 */
public static List<String> extractImage(String html) {
    List<String> urls = new ArrayList<>();
    if (html != null) {
        Elements images = Jsoup.parseBodyFragment(html).select("img");
        if (images != null) {
            for (int i = 0; i < images.size(); i++) {
                urls.add(images.get(i).attr("src"));
            }
        }
    }
    return urls;
}
 
Example #23
Source File: OnnmyoujiSpider.java    From SpringBootUnity with MIT License 5 votes vote down vote up
/**
 * Gets the links to the mitama info detail pages.
 *
 * @return the href of every anchor inside the first ".heroList-1" element of
 *         the page fetched from URL
 */
private static List<String> getMitamaDetailInfoUrl() {
    Document doc = Jsoup.parse(HttpUtil.get(URL));
    Elements anchors = doc.select(".heroList-1").get(0).select("a");

    List<String> hrefs = new ArrayList<>();
    for (Element anchor : anchors) {
        hrefs.add(anchor.attr("href"));
    }
    return hrefs;
}
 
Example #24
Source File: ElementTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Compares children() with childNodes() on a div, a p, a span, an unknown
 * tag and an img parsed from one HTML snippet.
 */
@Test public void testChildrenElements() {
    Document doc = Jsoup.parse(
            "<div><p><a>One</a></p><p><a>Two</a></p>Three</div><span>Four</span><foo></foo><img>");
    Element div = doc.select("div").first();
    Element p = doc.select("p").first();
    Element span = doc.select("span").first();
    Element foo = doc.select("foo").first();
    Element img = doc.select("img").first();

    // div: two element children, three child nodes (the last is the "Three" text).
    Elements divChildren = div.children();
    assertEquals(2, divChildren.size());
    assertEquals("<p><a>One</a></p>", divChildren.get(0).outerHtml());
    assertEquals("<p><a>Two</a></p>", divChildren.get(1).outerHtml());
    assertEquals(3, div.childNodes().size());
    assertEquals("Three", div.childNodes().get(2).outerHtml());

    // p: a single element child.
    Elements pChildren = p.children();
    assertEquals(1, pChildren.size());
    assertEquals("One", pChildren.text());

    // span: no element children, one child node.
    assertEquals(0, span.children().size());
    assertEquals(1, span.childNodes().size());
    assertEquals("Four", span.childNodes().get(0).outerHtml());

    // foo and img: no children of either kind.
    assertEquals(0, foo.children().size());
    assertEquals(0, foo.childNodes().size());
    assertEquals(0, img.children().size());
    assertEquals(0, img.childNodes().size());
}
 
Example #25
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks that an unterminated textarea ends at the following p tag instead of
 * swallowing the rest of the input.
 */
@Test public void parsesUnterminatedTextarea() {
    // don't parse right to end, but break on <p>
    Document doc = Jsoup.parse("<body><p><textarea>one<p>two");

    String textareaText = doc.select("textarea").first().text();
    assertEquals("one", textareaText);

    String secondParagraph = doc.select("p").get(1).text();
    assertEquals("two", secondParagraph);
}
 
Example #26
Source File: UrlConnectTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks that issuing a GET after setting a request body raises
 * IllegalArgumentException.
 */
@Test
public void throwsIfRequestBodyForGet() throws IOException {
    boolean rejected;
    try {
        Jsoup.connect("https://jsoup.org").requestBody("fail").get();
        rejected = false;
    } catch (IllegalArgumentException e) {
        rejected = true;
    }
    assertTrue(rejected);
}
 
Example #27
Source File: ParseMeiZiTu.java    From v9porn with MIT License 5 votes vote down vote up
/**
 * Builds the list of image URLs for a picture gallery page.
 * <p>
 * The page count is read from the second-to-last link inside the element with
 * class {@code pagenavi} when that element has more than three links and the
 * link text is all digits; otherwise it is 1. One URL per page is then derived
 * from the {@code src} of the image inside the {@code main-image} element by
 * replacing {@code "01."} with the zero-padded page number followed by a dot.
 *
 * @param html markup of a gallery page; the main image URL is presumably that
 *             of page 1 (it is expected to contain "01.") - verify against caller
 * @return a result whose data holds exactly one URL per page
 */
public static BaseResult<List<String>> parsePicturePage(String html) {
    BaseResult<List<String>> baseResult = new BaseResult<>();

    Document doc = Jsoup.parse(html);

    int totalPage = 1;
    Element pageElement = doc.getElementsByClass("pagenavi").first();
    // A page without a pager is treated as a single-page gallery instead of
    // failing with a NullPointerException.
    if (pageElement != null) {
        Elements aElements = pageElement.select("a");
        if (aElements.size() > 3) {
            String pageStr = aElements.get(aElements.size() - 2).text();
            if (!TextUtils.isEmpty(pageStr) && TextUtils.isDigitsOnly(pageStr)) {
                totalPage = Integer.parseInt(pageStr);
            }
        }
    }

    String imageUrl = doc.getElementsByClass("main-image").first().selectFirst("img").attr("src");

    List<String> imageUrlList = new ArrayList<>();
    // The loop also covers the single-page case (for i == 1 the replacement is
    // a no-op), so no separate add is needed; adding the URL up front as well
    // would list the same image twice.
    for (int i = 1; i <= totalPage; i++) {
        String pageNumber = i < 10 ? "0" + i : String.valueOf(i);
        imageUrlList.add(imageUrl.replace("01.", pageNumber + "."));
    }
    baseResult.setData(imageUrlList);
    return baseResult;
}
 
Example #28
Source File: Header.java    From viritin with Apache License 2.0 5 votes vote down vote up
/**
 * Renders the pending text, if any, as an HTML heading of the configured
 * level. The text is passed through Jsoup.clean with this component's
 * whitelist, wrapped in h-tags, set as the value, and then cleared.
 */
private void render() {
    if (text == null) {
        return;
    }
    setContentMode(ContentMode.HTML);
    String heading = new StringBuilder("<h")
            .append(headerLevel)
            .append(">")
            .append(Jsoup.clean(text, getWhitelist()))
            .append("</h")
            .append(headerLevel)
            .append(">")
            .toString();
    super.setValue(heading);
    // Clearing the text makes the next call a no-op until it is set again.
    text = null;
}
 
Example #29
Source File: XmlTreeBuilderTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses malformed XML through Jsoup.parse with an explicitly supplied XML
 * parser and checks the normalised output.
 */
@Test
public void testSupplyParserToJsoupClass() {
    Document doc = Jsoup.parse(
            "<doc><val>One<val>Two</val></bar>Three</doc>", "http://foo.com/", Parser.xmlParser());

    String normalised = TextUtil.stripNewlines(doc.html());
    assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", normalised);
}
 
Example #30
Source File: SelectorTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks that an id selector returns every element carrying that id and
 * returns nothing when no element has it.
 */
@Test public void testById() {
    String duplicatedIds = "<div><p id=foo>Hello</p><p id=foo>Foo two!</p></div>";
    Elements matches = Jsoup.parse(duplicatedIds).select("#foo");
    assertEquals(2, matches.size());
    assertEquals("Hello", matches.get(0).text());
    assertEquals("Foo two!", matches.get(1).text());

    Elements noMatches = Jsoup.parse("<div id=1></div>").select("#foo");
    assertEquals(0, noMatches.size());
}