org.jsoup.select.Elements Java Examples
The following examples show how to use
org.jsoup.select.Elements.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JsoupHTMLLinkStructureExtractor.java From wandora with GNU General Public License v3.0 | 6 votes |
/**
 * Creates a document topic for the given document and hands every anchor
 * element of the document to {@code parseLink}.
 *
 * @param d the parsed HTML document whose {@code a} elements are processed
 * @param u value passed to {@code getOrCreateTopic} to obtain the document topic
 * @param t the topic map that is stored in this extractor and receives the topics
 * @return always {@code true}
 * @throws Exception if preparing the document topic fails
 */
@Override
public boolean extractTopicsFrom(Document d, String u, TopicMap t) throws Exception {
    this.tm = t;
    this.wandoraClass = getWandoraClassTopic(tm);

    Topic documentType = getOrCreateTopic(tm, DOC_TYPE, "Document");
    makeSubclassOf(tm, documentType, wandoraClass);

    Topic docTopic = getOrCreateTopic(tm, u);
    docTopic.addType(documentType);

    // A topic map failure on one link is logged; the remaining links are still processed.
    for (Element anchor : d.select("a")) {
        try {
            parseLink(anchor, docTopic);
        } catch (TopicMapException tme) {
            log(tme.getMessage());
        }
    }
    return true;
}
Example #2
Source File: SourcePrinterTest.java From warnings-ng-plugin with MIT License | 6 votes |
/**
 * Renders {@code format.xml} and checks that both the whole document text and
 * the text of its {@code pre} elements equal the file content, ignoring whitespace.
 */
@Test
@org.jvnet.hudson.test.Issue("JENKINS-55679")
void shouldRenderXmlFiles() {
    SourcePrinter printer = new SourcePrinter();
    Issue issue = new IssueBuilder().build();

    Document document = Jsoup.parse(
            printer.render(asStream("format.xml"), issue, NO_DESCRIPTION, ICON_URL));
    String expectedFile = toString("format.xml");

    assertThat(document.text()).isEqualToIgnoringWhitespace(expectedFile);
    assertThat(document.getElementsByTag("pre").text()).isEqualToIgnoringWhitespace(expectedFile);
}
Example #3
Source File: Rgaa30Rule050801.java From Asqatasun with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Splits the elements of {@code elementHandler} by whether they contain
 * data-table markup. Elements with markup are kept: the handler is reset with
 * {@code clean()} and refilled with them. Elements without markup are added to
 * the second handler when one is supplied.
 *
 * @param elementHandler handler whose elements are examined and then replaced
 * @param elementHandlerWithoutDataTableMarkup receives the elements without
 *        data-table markup; may be {@code null}, in which case they are not collected
 */
private void extractTableWithDataTableMarkup(
        ElementHandler<Element> elementHandler,
        ElementHandler<Element> elementHandlerWithoutDataTableMarkup) {
    Elements withMarkup = new Elements();
    for (Element candidate : elementHandler.get()) {
        boolean hasMarkup = candidate.select(DATA_TABLE_MARKUP_CSS_LIKE_QUERY).size() > 0;
        if (hasMarkup) {
            withMarkup.add(candidate);
            continue;
        }
        if (elementHandlerWithoutDataTableMarkup != null) {
            elementHandlerWithoutDataTableMarkup.add(candidate);
        }
    }
    elementHandler.clean().addAll(withMarkup);
}
Example #4
Source File: LeaveOneOutCV.java From NLIWOD with GNU Affero General Public License v3.0 | 6 votes |
public static ArrayList<String> loadSystemR(String system){ Path datapath = Paths.get("./src/main/resources/QALD6MultilingualLogs/multilingual_" + system + ".html"); ArrayList<String> result = Lists.newArrayList(); try{ String loadedData = Files.lines(datapath).collect(Collectors.joining()); Document doc = Jsoup.parse(loadedData); Element table = doc.select("table").get(5); Elements tableRows = table.select("tr"); for(Element row: tableRows){ Elements tableEntry = row.select("td"); result.add(tableEntry.get(1).ownText()); } result.remove(0); //remove the head of the table return result; }catch(IOException e){ e.printStackTrace(); log.debug("loading failed."); return result; } }
Example #5
Source File: WhenJavaExtensionIsRegistered.java From asciidoctorj with Apache License 2.0 | 6 votes |
/**
 * Registers a single {@code YellBlock} instance and converts the sample file
 * twice, expecting the upper-cased paragraph on each conversion.
 */
@Test
public void a_block_processor_instance_should_be_executed_twice() {
    JavaExtensionRegistry javaExtensionRegistry = this.asciidoctor.javaExtensionRegistry();

    Map<String, Object> config = new HashMap<>();
    config.put(Contexts.KEY, Arrays.asList(Contexts.PARAGRAPH));
    config.put(ContentModel.KEY, ContentModel.SIMPLE);
    javaExtensionRegistry.block(new YellBlock("yell", config));

    int remainingRuns = 2;
    while (remainingRuns-- > 0) {
        String content = asciidoctor.convertFile(
                classpath.getResource("sample-with-yell-block.ad"),
                options().toFile(false).get());

        // NOTE(review): the second argument of Jsoup.parse(String, String) is a base URI,
        // not a charset - confirm "UTF-8" is intended here.
        org.jsoup.nodes.Document doc = Jsoup.parse(content, "UTF-8");
        Elements paragraphs = doc.getElementsByClass("paragraph");
        assertThat(paragraphs.size(), is(1));
        assertThat(paragraphs.get(0).text(), is("THE TIME IS NOW. GET A MOVE ON."));
    }
}
Example #6
Source File: Nanrentu.java From PicKing with Apache License 2.0 | 6 votes |
/**
 * Builds the URL of the next detail page from the "next page" link of the
 * pager in the given page.
 *
 * @param baseUrl not used by this implementation
 * @param currentUrl URL of the current page; its prefix matched by {@code http.*}{@code /} is prepended to the link
 * @param result raw page bytes, decoded as gb2312
 * @return the next page URL, or an empty string when no usable link or prefix is found
 * @throws UnsupportedEncodingException if the gb2312 charset is not supported
 */
@Override
public String getDetailNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    Document page = Jsoup.parse(new String(result, "gb2312"));
    Elements nextLinks = page.select("div.pagelist a:contains(下一页)");
    if (nextLinks.isEmpty()) {
        return "";
    }

    String relative = nextLinks.first().attr("href");
    if (relative == null || relative.isEmpty()) {
        return "";
    }

    Matcher prefix = Pattern.compile("http.*/").matcher(currentUrl);
    if (!prefix.find()) {
        return "";
    }
    return prefix.group() + relative;
}
Example #7
Source File: MypCardPricer.java From MtgDesktopCompanion with GNU General Public License v3.0 | 6 votes |
/**
 * Reads the offer rows ({@code table.table tr[data-key]}) of the page at
 * {@code urlC} and appends one {@code MagicPrice} per row to {@code list}.
 *
 * @param urlC URL of the page to read; also stored as the URL of every offer
 * @param list receives the parsed offers
 * @throws IOException declared for failures while fetching the page
 */
private void parsingOffers(String urlC, List<MagicPrice> list) throws IOException {
    Elements offerRows = URLTools.extractHtml(urlC).select("table.table tr[data-key]");

    for (Element offerRow : offerRows) {
        Elements cells = offerRow.select("td");
        if (cells.isEmpty()) {
            // A keyed row without cells ends the scan; later rows are not read.
            logger.debug(getName() + " found no offer");
            return;
        }

        MagicPrice offer = new MagicPrice();
        offer.setCountry("Brazil");
        offer.setCurrency(Currency.getInstance("BRL"));
        offer.setSite(getName());
        offer.setSeller(cells.get(1).text());
        offer.setFoil(cells.get(2).html().equalsIgnoreCase("foil"));
        offer.setQuality(cells.get(3).html());

        // Strip the "R$ " prefix and use a dot as decimal separator before parsing.
        String rawPrice = cells.get(5).text().replaceAll("R\\$ ", "").replace(",", ".");
        offer.setValue(Double.parseDouble(rawPrice));
        offer.setUrl(urlC);

        list.add(offer);
    }

    logger.debug(getName() + " found " + list.size() + " offers");
}
Example #8
Source File: FreemarkerClientPartialsBasicPropertyTest.java From angularjs-addon with Eclipse Public License 1.0 | 6 votes |
/**
 * Processes the basic-property detail template for a number property with a
 * minimum constraint and checks the generated form group and its input element.
 */
@Test
public void testGenerateBasicNumberPropertyWithMinConstraint() throws Exception {
    Map<String, Object> root = createInspectionResultWrapper(ENTITY_NAME, NUMBER_PROP_WITH_MIN_VAL);

    URL templateUrl = getClass().getResource(
            Deployments.BASE_PACKAGE_PATH + Deployments.BASIC_PROPERTY_DETAIL_INCLUDE);
    Resource<URL> templateResource = resourceFactory.create(templateUrl);
    Template processor = processorFactory.create(templateResource, FreemarkerTemplate.class);
    String output = processor.process(root);
    Document html = Jsoup.parseBodyFragment(output);
    assertThat(output.trim(), not(equalTo("")));

    // The wrapping form group must exist and carry an ng-class binding.
    Elements container = html.select("div.form-group");
    assertThat(container, notNullValue());
    assertThat(container.attr("ng-class"), not(equalTo("")));

    // The input must be a number field bound to the entity's score with min=0.
    Elements formInputElement = html.select("div.form-group input");
    assertThat(formInputElement.attr("id"), equalTo("score"));
    assertThat(formInputElement.attr("type"), equalTo("number"));
    assertThat(formInputElement.attr("min"), equalTo("0"));
    String expectedModel = StringUtils.camelCase(ENTITY_NAME) + "." + "score";
    assertThat(formInputElement.attr("ng-model"), equalTo(expectedModel));
}
Example #9
Source File: DataUtil.java From Focus with GNU General Public License v3.0 | 6 votes |
/**
 * Returns the {@code src} of the first {@code img} element found in the item's
 * content, falling back to the item's summary when the content is null or empty.
 *
 * @param item the collection item to inspect
 * @return the {@code src} attribute of the first image, or {@code null} when
 *         there is no text to parse or it contains no image
 */
public static String getCollectionItemImageUrl(Collection item) {
    String content = Strings.isNullOrEmpty(item.getContent())
            ? item.getSummary()
            : item.getContent();
    if (content == null || content.equals("")) {
        return null;
    }

    Document doc = Jsoup.parse(content);
    if (doc == null) {
        return null;
    }

    Elements images = doc.select("img");
    if (images.size() > 0) {
        return images.get(0).attr("src");
    }
    return null;
}
Example #10
Source File: WhenJavaExtensionIsRegistered.java From asciidoctorj with Apache License 2.0 | 6 votes |
/**
 * Registers {@code TestBlock} by class under the name {@code quiet} and checks
 * that converting a {@code [quiet]} paragraph yields the lower-cased text.
 * <p>
 * The static {@code TestBlock.asciidoctor} reference is reset in a
 * {@code finally} block so a failing assertion cannot leave it set for other tests.
 */
@Test
public void a_extension_registered_as_class_can_get_its_asciidoctor_instance() {
    JavaExtensionRegistry javaExtensionRegistry = this.asciidoctor.javaExtensionRegistry();

    TestBlock.asciidoctor = asciidoctor;
    try {
        // The block is registered by class; no config map is passed to this overload.
        javaExtensionRegistry.block("quiet", TestBlock.class);

        String content = asciidoctor.convert(
                "[quiet]\nHello World",
                options().toFile(false).get());

        // NOTE(review): the second argument of Jsoup.parse(String, String) is a base URI,
        // not a charset - confirm "UTF-8" is intended here.
        org.jsoup.nodes.Document doc = Jsoup.parse(content, "UTF-8");
        Elements elements = doc.getElementsByClass("paragraph");
        assertThat(elements.size(), is(1));
        assertThat(elements.get(0).text(), is("hello world"));
    } finally {
        TestBlock.asciidoctor = null;
    }
}
Example #11
Source File: FeilongipProxyListPageParser.java From ProxyPool with Apache License 2.0 | 6 votes |
/**
 * Parses the proxy table ({@code div[id=j-tab-newprd] table tbody tr}) of the
 * given page into proxies.
 * <p>
 * Each row supplies {@code ip:port} in the cell selected by {@code td:eq(1)},
 * the anonymity text in {@code td:eq(3)} and the type in {@code td:eq(4)}.
 * When {@code anonymousFlag} is set, only rows whose anonymity text contains
 * "匿" or "anonymous" are kept.
 *
 * @param html page source
 * @return the proxies that passed the filter, in row order
 */
@Override
public List<Proxy> parse(String html) {
    Document document = Jsoup.parse(html);
    Elements rows = document.select("div[id=j-tab-newprd] table tbody tr");
    List<Proxy> proxyList = new ArrayList<>();

    for (Element row : rows) {
        String[] ipAndPort = row.select("td:eq(1)").first().text().split(":");
        String ip = ipAndPort[0];
        String port = ipAndPort[1];
        String isAnonymous = row.select("td:eq(3)").first().text();
        String type = row.select("td:eq(4)").first().text();

        boolean accepted = !anonymousFlag
                || isAnonymous.contains("匿")
                || isAnonymous.contains("anonymous");
        if (accepted) {
            proxyList.add(new Proxy(ip, Integer.valueOf(port), type, Constant.TIME_INTERVAL));
        }
    }
    return proxyList;
}
Example #12
Source File: DemoAnnotatedBingCrawler.java From WebCollector with GNU General Public License v3.0 | 6 votes |
/**
 * Handles a search-engine result page: prints a progress line and schedules
 * every result link ({@code li.b_algo>h2>a}) for crawling.
 *
 * @param page the fetched result page; its meta data supplies keyword and page index
 * @param next receives one follow-up task per result link
 */
@MatchType(types = "searchEngine")
public void visitSearchEngine(Page page, CrawlDatums next) {
    String keyword = page.meta("keyword");
    int pageIndex = page.metaAsInt("pageIndex");
    System.out.println("成功抓取关键词" + keyword + "的第" + pageIndex + "页搜索结果");

    Elements results = page.select("li.b_algo>h2>a");
    int rank = 0;
    for (Element result : results) {
        /*
         * Each search result points to a page we want to crawl next, called an
         * outlink here. When the outlink is visited we still want to know on
         * which result page and at which position it was listed, so the page
         * index and rank are stored in the follow-up CrawlDatum. The type is
         * set to "outlink" to tell outlinks apart from search-engine result
         * pages; the value is user-defined and could be anything.
         */
        String href = result.attr("abs:href");
        next.addAndReturn(href)
                .type("outlink")
                .meta("keyword", keyword)
                .meta("pageIndex", pageIndex)
                .meta("rank", rank);
        rank++;
    }
}
Example #13
Source File: WebDavFile.java From a with GNU General Public License v3.0 | 6 votes |
/**
 * Parses a directory listing response into file entries.
 * <p>
 * Every {@code d:response} element whose {@code d:href} text does not end with
 * "/" becomes a {@code WebDavFile} located under this object's URL.
 *
 * @param s the response body to parse
 * @return the file entries; entries whose URL is malformed are left out
 */
private List<WebDavFile> parseDir(String s) {
    List<WebDavFile> files = new ArrayList<>();
    Document document = Jsoup.parse(s);
    Elements responses = document.getElementsByTag("d:response");
    String baseUrl = getUrl().endsWith("/") ? getUrl() : getUrl() + "/";

    for (Element response : responses) {
        String href = response.getElementsByTag("d:href").get(0).text();
        if (href.endsWith("/")) {
            // hrefs ending with "/" are not turned into file entries
            continue;
        }
        String fileName = href.substring(href.lastIndexOf("/") + 1);
        try {
            WebDavFile entry = new WebDavFile(baseUrl + fileName);
            entry.setDisplayName(fileName);
            entry.setUrlName(href);
            files.add(entry);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }
    return files;
}
Example #14
Source File: Text.java From JsoupXpath with Apache License 2.0 | 6 votes |
/**
 * Function logic: collects the text of the context elements.
 * <p>
 * In recursive mode the call is delegated to the {@code allText} node test.
 * Otherwise each element contributes its own text, except {@code script}
 * elements, which contribute their data.
 *
 * @param scope the evaluation context
 * @return the computed value
 */
@Override
public XValue call(Scope scope) {
    Elements context = scope.context();
    List<String> texts = new LinkedList<>();

    if (context == null || context.size() <= 0) {
        return XValue.create(texts);
    }
    if (scope.isRecursion()) {
        NodeTest allTextFun = Scanner.findNodeTestByName("allText");
        return allTextFun.call(scope);
    }

    for (Element element : context) {
        boolean isScript = "script".equals(element.nodeName());
        texts.add(isScript ? element.data() : element.ownText());
    }
    return XValue.create(texts);
}
Example #15
Source File: CDTClassifierMultilable.java From NLIWOD with GNU Affero General Public License v3.0 | 6 votes |
public static ArrayList<String> loadSystemP(String system){ Path datapath = Paths.get("./src/main/resources/QALD6MultilingualLogs/multilingual_" + system + ".html"); ArrayList<String> result = Lists.newArrayList(); try{ String loadedData = Files.lines(datapath).collect(Collectors.joining()); Document doc = Jsoup.parse(loadedData); Element table = doc.select("table").get(5); Elements tableRows = table.select("tr"); for(Element row: tableRows){ Elements tableEntry = row.select("td"); result.add(tableEntry.get(2).ownText()); } result.remove(0); //remove the head of the table return result; }catch(IOException e){ e.printStackTrace(); log.debug("loading failed."); return result; } }
Example #16
Source File: RssLoader.java From android-opensource-library-56 with Apache License 2.0 | 6 votes |
/**
 * Reads every {@code item} element of the document with CSS selectors and adds
 * it to {@code mList}, creating the list on first use.
 *
 * @param document the parsed feed document
 */
private void parseCssSelector(Document document) {
    for (Element entry : document.select("item")) {
        Item item = new Item();

        // first() is null when the selection is empty, which leaves the field unset
        Element title = entry.select("title").first();
        Element link = entry.select("link").first();
        if (title != null) {
            item.title = title.text();
        }
        if (link != null) {
            item.url = link.text();
        }

        if (mList == null) {
            mList = new RssList();
        }
        mList.addItem(item);
    }
}
Example #17
Source File: CoderBusyProxyListPageParser.java From ProxyPool with Apache License 2.0 | 6 votes |
/**
 * Parses the proxy table of the given page into proxies.
 * <p>
 * Each row supplies the IP ({@code td:eq(0)}), port ({@code td:eq(2)}), type
 * ({@code td:eq(5)}) and anonymity text ({@code td:eq(7)}). When
 * {@code anonymousFlag} is set, only rows whose anonymity text contains "匿"
 * are kept.
 *
 * @param content page source
 * @return the proxies that passed the filter, in row order
 */
@Override
public List<Proxy> parse(String content) {
    Document document = Jsoup.parse(content);
    Elements rows = document.select("div[class='table-responsive'] table[class='table'] tbody tr");
    List<Proxy> proxyList = new ArrayList<>(rows.size());

    for (Element row : rows) {
        String ip = row.select("td:eq(0)").first().text();
        String port = row.select("td:eq(2)").first().text();
        String type = row.select("td:eq(5)").first().text();
        String isAnonymous = row.select("td:eq(7)").first().text();

        System.out.println("ip:"+ip);
        log.debug("parse result = "+type+"://"+ip+":"+port+" "+isAnonymous);

        boolean accepted = !anonymousFlag || isAnonymous.contains("匿");
        if (accepted) {
            proxyList.add(new Proxy(ip, Integer.valueOf(port), type, Constant.TIME_INTERVAL));
        }
    }
    return proxyList;
}
Example #18
Source File: BatchFilterTest.java From jinjava with Apache License 2.0 | 6 votes |
/**
 * Renders the {@code batch-filter} template with six items and expects two
 * rows of three cells holding "1" to "6" in order.
 */
@Test
public void batchFilterNoBackfill() {
    Map<String, Object> context = ImmutableMap.of(
            "items", (Object) Lists.newArrayList("1", "2", "3", "4", "5", "6")
    );

    Document dom = Jsoup.parseBodyFragment(render("batch-filter", context));
    Elements trs = dom.select("tr");
    assertThat(trs).hasSize(2);

    final int cellsPerRow = 3;
    for (int row = 0; row < 2; row++) {
        Elements tds = trs.get(row).select("td");
        assertThat(tds).hasSize(cellsPerRow);
        for (int col = 0; col < cellsPerRow; col++) {
            // Cells are numbered 1..6 row by row.
            String expected = String.valueOf(row * cellsPerRow + col + 1);
            assertThat(tds.get(col).text()).isEqualTo(expected);
        }
    }
}
Example #19
Source File: HtmlUtils.java From TrackRay with GNU General Public License v3.0 | 6 votes |
/**
 * Extracts the form data contained in a web page.
 * <p>
 * Every {@code input} element is stored under its {@code name} attribute, or
 * under its id when the name is blank; inputs with neither are skipped.
 *
 * @param source the page source
 * @return the form data as key/value pairs; empty when {@code source} is blank
 */
public static Map<String, String> extractForm(String source) {
    Map<String, String> formData = new HashMap<>();
    if (!StringUtils.isNotBlank(source)) {
        return formData;
    }

    Document doc = Jsoup.parse(source);
    for (Element input : doc.select("input")) {
        String key;
        if (org.apache.commons.lang3.StringUtils.isNotBlank(input.attr("name"))) {
            key = input.attr("name");
        } else if (org.apache.commons.lang3.StringUtils.isNotBlank(input.id())) {
            key = input.id();
        } else {
            continue;
        }
        formData.put(key, input.val());
    }
    return formData;
}
Example #20
Source File: CDTClassifierEvaluation.java From NLIWOD with GNU Affero General Public License v3.0 | 6 votes |
public static ArrayList<String> loadSystemR(String system){ Path datapath = Paths.get("./src/main/resources/QALD6MultilingualLogs/multilingual_" + system + ".html"); ArrayList<String> result = Lists.newArrayList(); try{ String loadedData = Files.lines(datapath).collect(Collectors.joining()); Document doc = Jsoup.parse(loadedData); Element table = doc.select("table").get(5); Elements tableRows = table.select("tr"); for(Element row: tableRows){ Elements tableEntry = row.select("td"); result.add(tableEntry.get(1).ownText()); } result.remove(0); //remove the head of the table return result; }catch(IOException e){ e.printStackTrace(); log.debug("loading failed."); return result; } }
Example #21
Source File: WebPage.java From zap-extensions with Apache License 2.0 | 6 votes |
/** * Extracts script nodes from web page HTML document * * @param url * @throws IOException */ private void getScriptNodes(URL url) throws IOException { // Document doc = getHTML(url);// this required another connexion Elements scripts = HTMLDoc.select("script"); for (int i = 0; i < scripts.size(); i++) { Element script = scripts.get(i); if (script.hasAttr("src")) { // System.out.println("script = "+scripts.get(i)+""); this.scripts.add(script); } // System.out.println("-----------------------"); } }
Example #22
Source File: WhenJavaExtensionIsRegistered.java From asciidoctorj with Apache License 2.0 | 6 votes |
/**
 * Registers a {@code YellBlock} instance and checks that converting the sample
 * file produces exactly one paragraph with the upper-cased text.
 */
@Test
public void a_block_processor_instance_should_be_executed_when_registered_block_is_found_in_document() {
    JavaExtensionRegistry javaExtensionRegistry = this.asciidoctor.javaExtensionRegistry();

    Map<String, Object> config = new HashMap<>();
    config.put(Contexts.KEY, Arrays.asList(Contexts.PARAGRAPH));
    config.put(ContentModel.KEY, ContentModel.SIMPLE);
    javaExtensionRegistry.block(new YellBlock("yell", config));

    String content = asciidoctor.convertFile(
            classpath.getResource("sample-with-yell-block.ad"),
            options().toFile(false).get());

    // NOTE(review): the second argument of Jsoup.parse(String, String) is a base URI,
    // not a charset - confirm "UTF-8" is intended here.
    org.jsoup.nodes.Document doc = Jsoup.parse(content, "UTF-8");
    Elements paragraphs = doc.getElementsByClass("paragraph");
    assertThat(paragraphs.size(), is(1));
    assertThat(paragraphs.get(0).text(), is("THE TIME IS NOW. GET A MOVE ON."));
}
Example #23
Source File: UtilsStaticAnalyzer.java From apogen with Apache License 2.0 | 6 votes |
/**
 * Builds one {@code Form} object for every {@code form} element found in the
 * DOM of the given state.
 *
 * @param s the state whose DOM is analyzed
 * @return the set of form objects created from the state's DOM
 */
public static Set<Form> createFormObjects(State s) {
    Set<Form> formObjects = new HashSet<Form>();

    // NOTE(review): the second argument of Jsoup.parse(String, String) is a base URI,
    // not a charset - confirm "UTF-8" is intended here.
    Document doc = Jsoup.parse(s.getDom(), "UTF-8");
    Elements forms = doc.getElementsByTag("form");
    System.out.println("[LOG]\t" + forms.size() + " form(s) found in " + s.getName());

    for (org.jsoup.nodes.Element formElement : forms) {
        Form formObject = new Form();
        formObject.setAttributes(formElement.attributes().asList());
        formObject.setFormName(getFormName(formObject, formElement));
        formObject.setReturnValue("void");
        formObject.setFormFieldList(analyzeFormBody(s, formElement));
        formObjects.add(formObject);
    }
    return formObjects;
}
Example #24
Source File: JsoupCssInliner.java From ogham with Apache License 2.0 | 5 votes |
/**
 * Replaces each CSS link element with a style element holding the content of
 * the linked file, so that the styles keep their original inclusion order.
 * Links for which inlining is not allowed, or whose file is not among
 * {@code cssContents}, are left untouched.
 *
 * @param doc
 *            the html document
 * @param cssContents
 *            the list of external css files with their content
 */
private static void internStyles(Document doc, List<ExternalCss> cssContents) {
    for (Element link : doc.select(CSS_LINKS_SELECTOR)) {
        if (!isInlineModeAllowed(link, InlineModes.STYLE_ATTR)) {
            continue;
        }
        ExternalCss css = getCss(cssContents, link.attr(HREF_ATTR));
        if (css == null) {
            continue;
        }
        Element style = new Element(Tag.valueOf(STYLE_TAG), "");
        style.appendChild(new DataNode(getCssContent(css)));
        link.replaceWith(style);
    }
}
Example #25
Source File: MoverParser.java From Mover with Apache License 2.0 | 5 votes |
/**
 * Reads the number of the last navigation page from the pagination links.
 *
 * @param document the page to inspect
 * @return the value extracted from the text of the last pagination link, or
 *         {@code -1} when the page has no pagination links
 */
public int getLastNavigationPage(Document document) {
    Elements pageLinks = document.select("div.pagination .digits .ut a");
    if (pageLinks.isEmpty()) {
        return -1;
    }
    return internalGetIntegers(pageLinks.last().text());
}
Example #26
Source File: LoginHelper.java From hipda with GNU General Public License v2.0 | 5 votes |
/**
 * Fetches the login page and extracts the value of its {@code formhash} input.
 * <p>
 * When the input is missing, {@code mErrorMsg} is set to the text of the
 * page's {@code div.alert_info} element if there is one, otherwise to a
 * generic message, and an empty string is returned. When an exception occurs,
 * {@code mErrorMsg} is set from it and the response fetched so far is returned.
 *
 * @return the formhash value; an empty string when the input is missing;
 *         otherwise the raw response ({@code null} if none was fetched)
 */
private String getFormhash() {
    String response = null;
    try {
        response = OkHttpHelper.getInstance().get(HiUtils.LoginGetFormHash);
        if (TextUtils.isEmpty(response)) {
            return response;
        }

        Document page = Jsoup.parse(response);
        Element hashInput = page.select("input[name=formhash]").first();
        if (hashInput != null) {
            return hashInput.attr("value");
        }

        // No formhash input: prefer the page's own alert text as the error message.
        Element alert = page.select("div.alert_info").first();
        if (alert != null) {
            mErrorMsg = alert.text();
        } else {
            mErrorMsg = "Can NOT get formhash";
        }
        return "";
    } catch (Exception e) {
        mErrorMsg = OkHttpHelper.getErrorMessage(e).getMessage();
    }
    return response;
}
Example #27
Source File: ImagebamRipper.java From ripme with MIT License | 5 votes |
/** * Rips useful image from "image page" */ private void fetchImage() { try { Document doc = Http.url(url).get(); // Find image Elements metaTags = doc.getElementsByTag("meta"); String imgsrc = "";//initialize, so no NullPointerExceptions should ever happen. for (Element metaTag: metaTags) { //the direct link to the image seems to always be linked in the <meta> part of the html. if (metaTag.attr("property").equals("og:image")) { imgsrc = metaTag.attr("content"); LOGGER.info("Found URL " + imgsrc); break;//only one (useful) image possible for an "image page". } } //for debug, or something goes wrong. if (imgsrc.isEmpty()) { LOGGER.warn("Image not found at " + this.url); return; } // Provide prefix and let the AbstractRipper "guess" the filename String prefix = ""; if (Utils.getConfigBoolean("download.save_order", true)) { prefix = String.format("%03d_", index); } addURLToDownload(new URL(imgsrc), prefix); } catch (IOException e) { LOGGER.error("[!] Exception while loading/parsing " + this.url, e); } }
Example #28
Source File: PCAGrader.java From MtgDesktopCompanion with GNU General Public License v3.0 | 5 votes |
/**
 * Loads the grading with the given identifier from the grader's web site.
 * <p>
 * The grade note and the grade year are read from the fourth and sixth
 * {@code li.mb-1} elements of the result page, after removing their
 * {@code strong} children.
 *
 * @param identifier the grading number to look up
 * @return the grading, or {@code null} when the page has no result entries or
 *         fewer entries than the ones read here
 * @throws IOException declared for failures while fetching the page
 */
@Override
public Grading loadGrading(String identifier) throws IOException {
    String url = getWebSite() + "/resumeBdd/" + identifier + "/1";
    Document d = RequestBuilder.build().method(METHOD.GET)
            .setClient(URLTools.newClient())
            .url(url)
            .toHtml();

    Elements els = d.select("li.mb-1");
    if (els.isEmpty()) {
        logger.debug(identifier + " is not found for " + getName());
        return null;
    }
    // Indices 3 and 5 are read below; a shorter list would throw IndexOutOfBoundsException.
    if (els.size() < 6) {
        logger.debug(identifier + " has an incomplete result page for " + getName());
        return null;
    }

    els.get(3).select("strong").remove();
    els.get(5).select("strong").remove();

    Grading g = new Grading();
    g.setGraderName(getName());
    g.setNumberID(identifier);
    g.setGradeNote(Double.parseDouble(els.get(3).text()));
    try {
        g.setGradeDate(new SimpleDateFormat("yyyy").parse(els.get(5).text()));
    } catch (ParseException e) {
        // An unparsable year leaves the grade date unset.
        logger.error(e);
    }
    g.setUrlInfo(url);
    return g;
}
Example #29
Source File: FUN_CSSPath.java From sparql-generate with Apache License 2.0 | 5 votes |
/**
 * Evaluates a CSS selector against an element and returns the value of an
 * attribute of the first match.
 *
 * @param element the element the selector is evaluated against
 * @param selectPath the CSS selector
 * @param attributeName the attribute to read from the first match
 * @return the attribute value wrapped as a string node value
 * @throws ExprEvalException if nothing matches or the first match lacks the attribute
 */
private NodeValue selectAttribute(Element element, String selectPath, String attributeName) {
    Element firstMatch = element.select(selectPath).first();
    if (firstMatch == null) {
        throw new ExprEvalException("No evaluation of " + element + ", " + selectPath);
    }
    if (firstMatch.hasAttr(attributeName)) {
        return new NodeValueString(firstMatch.attr(attributeName));
    }
    throw new ExprEvalException("The evaluation of " + element + ", " + selectPath + " is an element that does not have attribute " + attributeName);
}
Example #30
Source File: HtmlParserTest.java From astor with GNU General Public License v2.0 | 5 votes |
/**
 * Checks that unknown tags are parsed as elements: both {@code foo} tags are
 * found, with their attributes and text intact.
 */
@Test
public void handlesUnknownTags() {
    String h = "<div><foo title=bar>Hello<foo title=qux>there</foo></div>";
    Elements foos = Jsoup.parse(h).select("foo");

    assertEquals(2, foos.size());

    Element outer = foos.first();
    assertEquals("bar", outer.attr("title"));

    assertEquals("qux", foos.last().attr("title"));
    assertEquals("there", foos.last().text());
}