org.jsoup.nodes.Element Java Examples

The following examples show how to use org.jsoup.nodes.Element. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JsoupSuperSubClassListExtractor.java    From wandora with GNU General Public License v3.0 8 votes vote down vote up
/**
 * Creates (or looks up) the topic named by the element's trimmed text, makes
 * it a subclass of the given class topic and hands every {@code ul} child of
 * the following sibling element to {@code parseList}.
 *
 * @param classElement element whose text is used as the topic name; nothing
 *                     happens when that text is empty after trimming
 * @param classTopic   superclass topic; {@code wandoraClass} is used when null
 * @throws TopicMapException declared for the topic map operations invoked here
 */
private void parseTopic(Element classElement, Topic classTopic) throws TopicMapException {
    final String topicName = classElement.text().trim();
    if (topicName.isEmpty()) {
        return;
    }

    final Topic topic = getOrCreateTopic(tm, null, topicName);
    if (classTopic == null) {
        classTopic = wandoraClass;
    }
    makeSubclassOf(tm, topic, classTopic);

    // The element that follows may wrap one or more lists of instances.
    final Element sibling = classElement.nextElementSibling();
    if (sibling == null) {
        return;
    }
    for (Element candidate : sibling.children()) {
        if ("ul".equals(candidate.tagName())) {
            parseList(candidate, topic);
        }
    }
}
 
Example #2
Source File: OutputFormatter.java    From JumpGo with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Recursively appends the text of {@code e}'s child nodes to {@code accum},
 * skipping every child for which {@code unlikely} returns true.
 *
 * @param e      element whose children are walked
 * @param accum  buffer receiving the text
 * @param indent recursion depth; incremented on each nested call
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node node : e.childNodes()) {
        if (unlikely(node)) {
            continue;
        }
        if (node instanceof TextNode) {
            accum.append(((TextNode) node).text());
            continue;
        }
        if (!(node instanceof Element)) {
            continue;
        }

        Element nested = (Element) node;
        // A single space separates block elements from preceding text and
        // stands in for <br>.
        boolean blockNeedsSeparator = accum.length() > 0
                && nested.isBlock()
                && !lastCharIsWhitespace(accum);
        if (blockNeedsSeparator || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
 
Example #3
Source File: RssLoader.java    From android-opensource-library-56 with Apache License 2.0 6 votes vote down vote up
/**
 * Builds one {@code Item} per {@code item} element of the document, using the
 * text of its first {@code title} and {@code link} descendants, and adds it to
 * {@code mList} (created on first use).
 *
 * @param document parsed feed document
 */
private void parseCssSelector(Document document) {
    for (Element entry : document.select("item")) {
        Item item = new Item();
        Element firstTitle = entry.select("title").first();
        Element firstLink = entry.select("link").first();
        if (firstTitle != null) {
            item.title = firstTitle.text();
        }
        if (firstLink != null) {
            item.url = firstLink.text();
        }
        // The list is only created once there is something to put in it.
        if (mList == null) {
            mList = new RssList();
        }
        mList.addItem(item);
    }
}
 
Example #4
Source File: ZenPostModel.java    From zen4android with MIT License 6 votes vote down vote up
/**
 * Refreshes {@code mSign} and {@code mTime} from the page at
 * {@code ZEN_UPDATE_SIGN_URL + mFid}.
 *
 * Both fields are reset to the empty string first. They are only filled when
 * a token is available, the request returns a body, and that body contains
 * both an {@code input[name=sign]} and an {@code input[name=time]} element.
 * Any exception is printed and otherwise ignored (best effort).
 */
public void UpdateSign() {
	try {
		mTime = "";
		mSign = "";
		String token = ZenUtils.getToken();
		if (token == null) {
			return;
		}

		ZenURLConnection connection = new ZenURLConnection(ZEN_UPDATE_SIGN_URL + mFid);
		connection.addRequestHeader("Cookie", "u=" + URLEncoder.encode(token, "utf-8") + ";");
		connection.addRequestHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0");
		String response = connection.startSychronous();
		if (response == null) {
			return;
		}

		Document doc = Jsoup.parse(response);
		Element signEle = doc.select("input[name=sign]").first();
		Element timeEle = doc.select("input[name=time]").first();
		// first() yields null when nothing matches; check explicitly instead
		// of relying on a caught NullPointerException.
		if (signEle == null || timeEle == null) {
			System.err.println("UpdateSign: sign/time input not found in response");
			return;
		}
		mSign = signEle.attr("value");
		mTime = timeEle.attr("value");

		System.out.println("time: " + mTime + " sign: " + mSign);
	} catch (Exception e) {
		e.printStackTrace();
	}
}
 
Example #5
Source File: M2DocHTMLParser.java    From M2Doc with Eclipse Public License 1.0 6 votes vote down vote up
/**
 * Sets the unordered list numbering.
 *
 * The bullet symbol is chosen from the element's {@code TYPE_ATTR} attribute
 * ("square" or "circle"); any other value, or a missing attribute, selects the
 * disc symbol.
 *
 * @param context
 *            the {@link Context}; its numbering is created when absent and its
 *            numbering level is updated
 * @param element
 *            the list {@link Element}
 */
private void setUnorderedListNumbering(Context context, Element element) {
    // Disc is the fallback for both a missing and an unrecognised type.
    String symbol = DISC_SYMBOL;
    if (element.hasAttr(TYPE_ATTR)) {
        final String type = element.attr(TYPE_ATTR);
        if ("square".equals(type)) {
            symbol = SQUARE_SYMBOL;
        } else if ("circle".equals(type)) {
            symbol = CIRCLE_SYMBOL;
        }
    }

    if (context.numbering == null) {
        createNumbering(context);
    }
    context.numberingLevel = incrementNumberingLevel(context.numbering, context.numberingLevel,
            STNumberFormat.BULLET, 1, symbol, false);
}
 
Example #6
Source File: Aw22Rule05081.java    From Asqatasun with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Partitions the elements of {@code elementHandler} by whether they contain at
 * least one match for {@code DATA_TABLE_MARKUP_CSS_LIKE_QUERY}. Afterwards
 * {@code elementHandler} holds only the matching elements; the others are
 * added to {@code elementHandlerWithoutDataTableMarkup} when it is not null.
 *
 * @param elementHandler handler to filter in place
 * @param elementHandlerWithoutDataTableMarkup optional receiver of the
 *            elements without data table markup (may be null)
 */
private void extractTableWithDataTableMarkup(
            ElementHandler<Element> elementHandler, 
            ElementHandler<Element> elementHandlerWithoutDataTableMarkup) {

    final Elements withMarkup = new Elements();

    for (Element candidate : elementHandler.get()) {
        final boolean hasMarkup =
                !candidate.select(DATA_TABLE_MARKUP_CSS_LIKE_QUERY).isEmpty();
        if (hasMarkup) {
            withMarkup.add(candidate);
            continue;
        }
        if (elementHandlerWithoutDataTableMarkup != null) {
            elementHandlerWithoutDataTableMarkup.add(candidate);
        }
    }
    elementHandler.clean().addAll(withMarkup);
}
 
Example #7
Source File: NcepHtmlScraper.java    From netcdf-java with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Serialises the given entries as a pretty-printed {@code tableA} XML
 * document, writes it as UTF-8 to {@code dirOut + filename} and, when
 * {@code show} is set, also prints it to standard output.
 *
 * @param name     text of the {@code title} element
 * @param source   text of the {@code source} element
 * @param filename file name appended to {@code dirOut}
 * @param stuff    entries; each becomes a {@code parameter} element
 * @throws IOException if the file cannot be written
 */
private void writeTableAXml(String name, String source, String filename, List<Stuff> stuff) throws IOException {
  org.jdom2.Element root = new org.jdom2.Element("tableA");
  org.jdom2.Document xmlDoc = new org.jdom2.Document(root);
  root.addContent(new org.jdom2.Element("title").setText(name));
  root.addContent(new org.jdom2.Element("source").setText(source));

  for (Stuff entry : stuff) {
    org.jdom2.Element parameter = new org.jdom2.Element("parameter");
    parameter.setAttribute("code", Integer.toString(entry.no));
    parameter.addContent(new org.jdom2.Element("description").setText(entry.desc));
    root.addContent(parameter);
  }

  String xml = new XMLOutputter(Format.getPrettyFormat()).outputString(xmlDoc);

  try (FileOutputStream out = new FileOutputStream(dirOut + filename)) {
    out.write(xml.getBytes(StandardCharsets.UTF_8));
  }

  if (show) {
    System.out.printf("%s%n", xml);
  }
}
 
Example #8
Source File: FuckBroDomain.java    From TrackRay with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Looks up the ICP registration page for {@code domain} on icp.aizhan.com and
 * collects the sites listed in the company table.
 *
 * @param domain domain inserted into the lookup URL
 * @return map from the text of each row's third cell to the text of its
 *         second cell; empty when the page lacks the expected markers or the
 *         request fails (failures are added to the task's exception list)
 */
public Map<String,String> aizhanIcp(String domain){
    Map<String, String> map = new HashMap<>();
    HttpClient httpClient = new HttpClient();
    String url = "https://icp.aizhan.com/%s/";
    try {
        ResponseStatus responseStatus = httpClient.get(String.format(url, domain));
        String html = responseStatus.getContent();
        if (!html.contains("未找到") && html.contains("该单位备案网站") && html.contains("缓存于"))
        {
            Document doc = Jsoup.parse(html);

            Elements trs = doc.select("div#company .table-s1 tbody tr");
            for (Element tr : trs) {
                // Select the cells once; rows with fewer than three cells are
                // skipped instead of aborting the whole loop with an
                // IndexOutOfBoundsException.
                Elements cells = tr.select("td");
                if (cells.size() < 3) {
                    continue;
                }
                String title = cells.get(1).text();
                String dom = cells.get(2).text();
                map.put(dom,title);
            }
        }
    } catch (Exception e) {
        task.getExceptions().add(e);
    }
    SysLog.info("ICP反查结束");
    return map;
}
 
Example #9
Source File: M66ipProxyListPageParser.java    From ProxyPool with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts proxies from the table rows of the given HTML page.
 *
 * Rows are taken from {@code table tr:gt(1)}; cells 0, 1 and 3 supply the IP,
 * the port and the anonymity text. Rows missing one of those cells, or whose
 * port is not a number, are skipped rather than failing the whole page.
 *
 * @param html page source
 * @return the parsed proxies; when {@code anonymousFlag} is set, only rows
 *         whose anonymity text contains "匿"
 */
@Override
public List<Proxy> parse(String html) {
    Document document = Jsoup.parse(html);
    Elements rows = document.select("table tr:gt(1)");
    List<Proxy> proxyList = new ArrayList<>(rows.size());
    for (Element row : rows) {
        Element ipCell = row.select("td:eq(0)").first();
        Element portCell = row.select("td:eq(1)").first();
        Element anonymityCell = row.select("td:eq(3)").first();
        // first() is null when the selector matches nothing (e.g. header rows).
        if (ipCell == null || portCell == null || anonymityCell == null) {
            continue;
        }

        String ip = ipCell.text();
        String port = portCell.text();
        String isAnonymous = anonymityCell.text();
        log.debug("parse result = http://"+ip+":"+port+"  "+isAnonymous);
        if (!anonymousFlag || isAnonymous.contains("匿")) {
            final Integer portNumber;
            try {
                portNumber = Integer.valueOf(port);
            } catch (NumberFormatException ignored) {
                // One malformed row must not discard the rest of the page.
                log.debug("skip row with non-numeric port: " + port);
                continue;
            }
            proxyList.add(new Proxy(ip, portNumber, "http", Constant.TIME_INTERVAL));
        }
    }
    return proxyList;
}
 
Example #10
Source File: ArticleTextExtractor.java    From Xndroid with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Weights the given element: the sum of {@code calcWeight(e)}, a score of
 * one point per ten characters of the element's own text (rounded), and
 * {@code weightChildNodes(e)}.
 *
 * @param e          element to weight, along with its child nodes
 * @param checkextra when true, the integer value of the first
 *                   {@code extragravityscore} attribute found by
 *                   {@code e.select("[extragravityscore]")} is added
 * @return the computed weight
 */
private int getWeight(Element e, boolean checkextra) {
    int total = calcWeight(e);
    total += (int) Math.round(e.ownText().length() / 100.0 * 10);
    total += weightChildNodes(e);

    if (!checkextra) {
        return total;
    }

    // optional bonus carried by an 'extragravityscore' attribute
    Element scored = e.select("[extragravityscore]").first();
    if (scored == null) {
        return total;
    }
    return total + Integer.parseInt(scored.attr("extragravityscore"));
}
 
Example #11
Source File: Rgaa30Rule060304.java    From Asqatasun with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Runs the check on the elements held by the link element selector.
 *
 * When the selector is empty, NOT_APPLICABLE is reported and nothing else
 * happens. Otherwise each decidable element is passed to {@code testLink}
 * and the result is added to the handler. Finally the process remark service
 * is reset and the locally collected {@code remarks} are added to its list.
 *
 * @param sspHandler handler supplying the process remark service and passed
 *            on to {@code testLink}
 * @param testSolutionHandler receiver of the test solutions
 */
@Override
protected void check(
        SSPHandler sspHandler, 
        TestSolutionHandler testSolutionHandler) {
    // nothing selected: the test does not apply
    if (getLinkElementSelector().isEmpty()) {
        testSolutionHandler.addTestSolution(TestSolution.NOT_APPLICABLE);
        return;
    }
    
    // keep the remark service for the reset/aggregation at the end
    prs = sspHandler.getProcessRemarkService();
    setServicesToChecker(titlePertinenceElementChecker);
    
    if (! getLinkElementSelector().getDecidableElements().isEmpty()) {
        setServicesToChecker(getDecidableElementsChecker());
        // one test solution per decidable element
        for (Element el : getLinkElementSelector().getDecidableElements().get()) {
            testSolutionHandler.addTestSolution(testLink(sspHandler, el));
        }
    }
    // reset service and aggregate all the remarks collected locally
    // for further save
    prs.resetService();
    prs.getRemarkList().addAll(remarks);
}
 
Example #12
Source File: Rgaa32016Rule100102.java    From Asqatasun with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Fills {@code attrElementHandlerMap} with one element handler per
 * attribute: every value of the {@code PRESENTATION_ATTR_NOM} nomenclature,
 * plus {@code WIDTH_ATTR} and {@code HEIGHT_ATTR} using their dedicated
 * "not img" queries. Also records the total number of elements.
 *
 * @param sspHandler handler the selectors run against
 */
@Override
protected void select(SSPHandler sspHandler) {
    totalNumberOfElements = sspHandler.getTotalNumberOfElements();

    // attributes listed in the nomenclature
    Nomenclature presentationAttrs =
            nomenclatureLoaderService.loadByCode(PRESENTATION_ATTR_NOM);
    for (String attrName : presentationAttrs.getValueList()) {
        SimpleElementSelector attrSelector =
                new SimpleElementSelector(buildQuery(attrName));
        ElementHandler<Element> attrMatches = new ElementHandlerImpl();
        attrSelector.selectElements(sspHandler, attrMatches);
        attrElementHandlerMap.put(attrName, attrMatches);
    }

    // elements with width attribute that are not img
    SimpleElementSelector widthSelector =
            new SimpleElementSelector(ELEMENT_WITH_WITDH_ATTR_NOT_IMG_V2);
    ElementHandler<Element> widthMatches = new ElementHandlerImpl();
    widthSelector.selectElements(sspHandler, widthMatches);
    attrElementHandlerMap.put(WIDTH_ATTR, widthMatches);

    // elements with height attribute that are not img
    SimpleElementSelector heightSelector =
            new SimpleElementSelector(ELEMENT_WITH_HEIGHT_ATTR_NOT_IMG_V2);
    ElementHandler<Element> heightMatches = new ElementHandlerImpl();
    heightSelector.selectElements(sspHandler, heightMatches);
    attrElementHandlerMap.put(HEIGHT_ATTR, heightMatches);
}
 
Example #13
Source File: ComicextraRipper.java    From ripme with MIT License 5 votes vote down vote up
/**
 * Loads the first page to rip.
 *
 * For a COMIC url the chapter links ({@code div.episode-list a}) are stored
 * in {@code chaptersList}, {@code chapterIndex} is reset to 0 and the first
 * chapter is fetched. For a CHAPTER url the url itself is fetched.
 *
 * @return the fetched document
 * @throws IOException if fetching fails, the url type is unknown, or a comic
 *             page lists no chapters
 */
@Override
protected Document getFirstPage() throws IOException {
    switch (urlType) {
        case COMIC:
            // For COMIC type url we extract the urls of each chapters and store them in chapters.
            chaptersList = new ArrayList<>();
            Document comicPage = Http.url(url).get();
            for (Element e : comicPage.select("div.episode-list a")) {
                chaptersList.add(getCompleteChapterUrl(e.attr("abs:href")));
            }

            // Set the first chapter from the chapterList as the doc.
            chapterIndex = 0;
            if (chaptersList.isEmpty()) {
                // Report through the declared exception type instead of an
                // unchecked IndexOutOfBoundsException from get(0).
                throw new IOException("No chapters found at " + url);
            }
            return Http.url(chaptersList.get(chapterIndex)).get();
        case CHAPTER:
            return Http.url(url).get();
        case UNKNOWN:
        default:
            throw new IOException("Unknown url type encountered.");
    }
}
 
Example #14
Source File: MarkupUtilsTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Adds two values under the same key and checks that both remain visible
 * through {@code MarkupUtils.getAttribute} and {@code MarkupUtils.getAttributes}.
 *
 * NOTE(review): the checks use the Java {@code assert} statement, which is only
 * evaluated when the JVM runs with assertions enabled ({@code -ea}); without it
 * this test passes without checking anything. Consider the test framework's
 * assertion methods instead.
 */
@Test
public void testAddAttribute() {
  Element e = new Element(Tag.valueOf("p"), "");
  MarkupUtils.addAttribute(e, "key", "value1");

  // the first value is visible on its own
  assert (MarkupUtils.getAttribute(e, "key").contains("value1"));

  MarkupUtils.addAttribute(e, "key", "value2");

  // adding a second value must not drop the first
  assert (MarkupUtils.getAttribute(e, "key").contains("value1"));
  assert (MarkupUtils.getAttribute(e, "key").contains("value2"));

  assert (MarkupUtils.getAttributes(e, "key").contains("value1"));
  assert (MarkupUtils.getAttributes(e, "key").contains("value2"));
}
 
Example #15
Source File: StatusDataSetter.java    From Simpler with Apache License 2.0 5 votes vote down vote up
/**
 * Opens the photo behind a link. Without a stored cookie the web login
 * activity is started; otherwise the page at {@code url} is fetched with the
 * cookie and the {@code src} of its first {@code img[src]} element is shown in
 * the photo viewer.
 *
 * @param widget the clicked view (not used here)
 * @param url    address of the page containing the photo
 */
@Override
public void onItemPhotoLinkListener(View widget, String url) {
    if (TextUtils.isEmpty(BaseConfig.sAccount.cookie)) {
        // No cookie yet: obtain one first
        mActivity.startActivity(WebWBActivity.newIntent(mActivity, true));
        AppToast.showToast(R.string.get_web_wb_cookie);
        return;
    }

    HttpGetTask httpTask = new HttpGetTask(true, new HttpListener() {
        @Override
        public void onResponse(String response) {
            if (TextUtils.isEmpty(response)) {
                // Cookie expired: obtain a fresh one
                AppToast.showToast(R.string.update_web_wb_cookie);
                mActivity.startActivity(WebWBActivity.newIntent(mActivity, true));
                return;
            }
            try {
                Element firstImage = Jsoup.parse(response).select("img[src]").get(0);
                String img = firstImage.attr("src");
                mActivity.startActivity(PhotoViewActivity.newIntent(mActivity, img));
            } catch (Exception ex) {
                // Any failure here is handled like an expired cookie.
                ex.printStackTrace();
                AppToast.showToast(R.string.update_web_wb_cookie);
                mActivity.startActivity(WebWBActivity.newIntent(mActivity, true));
            }
        }

        @Override
        public void onFailure() {
            AppToast.showToast(R.string.cannot_view_pic);
        }
    });
    httpTask.execute(url, BaseConfig.sAccount.cookie);
    BaseActivity host = (BaseActivity) mActivity;
    host.registerAsyncTask(host.getClass(), httpTask);
}
 
Example #16
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
@Test public void preservesSpaceInScript() {
    // whitespace is preserved because the script's content is a data node
    final String scriptBody = "\nOne\n\tTwo\n\tThree\n";
    Document parsed = Jsoup.parse("<script>\nOne\n\tTwo\n\tThree\n</script>");
    Element script = parsed.select("script").first();

    assertEquals(scriptBody, script.data());
    assertEquals("One\n\tTwo\n\tThree", script.html());
    assertEquals("<script>" + scriptBody + "</script>", script.outerHtml());
}
 
Example #17
Source File: StackOverflowParser.java    From SnowGraph with Apache License 2.0 5 votes vote down vote up
/**
 * Walks the children of {@code node} and collects one {@code CodeInfo} for
 * every {@code code} node directly inside a {@code pre} node. {@code p} and
 * {@code li} children are skipped; every other child is searched recursively.
 *
 * @param node root of the subtree to search
 * @return the collected code snippets, HTML-unescaped, in document order
 */
private static List<CodeInfo> parseHTMLNodeToParagraphs(Node node) {
	List<CodeInfo> collected = new ArrayList<>();
	for (Node child : node.childNodes()) {
		String tag = child.nodeName();
		if (tag.equals("p") || tag.equals("li")) {
			continue;
		}
		if (!tag.equals("pre")) {
			collected.addAll(parseHTMLNodeToParagraphs(child));
			continue;
		}
		for (Node inner : child.childNodes()) {
			if (inner.nodeName().equals("code")) {
				String text = ((Element) inner).text();
				collected.add(new CodeInfo(StringEscapeUtils.unescapeHtml4(text)));
			}
		}
	}
	return collected;
}
 
Example #18
Source File: Elements.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Prepends the supplied HTML to the inner HTML of every matched element, by
 * calling {@link Element#prepend(String)} on each one in turn.
 * @param html HTML to insert inside each element, ahead of its existing content
 * @return this, for chaining
 * @see Element#prepend(String)
 */
public Elements prepend(String html) {
    for (final Element matched : this) {
        matched.prepend(html);
    }
    return this;
}
 
Example #19
Source File: Elements.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Appends the supplied HTML to the inner HTML of every matched element, by
 * calling {@link Element#append(String)} on each one in turn.
 * @param html HTML to insert inside each element, behind its existing content
 * @return this, for chaining
 * @see Element#append(String)
 */
public Elements append(String html) {
    for (final Element matched : this) {
        matched.append(html);
    }
    return this;
}
 
Example #20
Source File: Elements.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Get the combined inner HTML of all matched elements. A newline is inserted
 * before an element's HTML whenever the text accumulated so far is non-empty.
 * @return string of all elements' inner HTML.
 * @see #text()
 * @see #outerHtml()
 */
public String html() {
    final StringBuilder combined = new StringBuilder();
    for (final Element matched : this) {
        // separator only once something has actually been written
        if (combined.length() > 0) {
            combined.append('\n');
        }
        combined.append(matched.html());
    }
    return combined.toString();
}
 
Example #21
Source File: HtmlNode.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Renders every element returned by {@code getElements()} with
 * {@code toString()}.
 *
 * @return one string per element, in the same order
 */
@Override
protected List<String> getSourceTexts() {
    final List<String> rendered = new ArrayList<String>(getElements().size());
    for (final Element current : getElements()) {
        rendered.add(current.toString());
    }
    return rendered;
}
 
Example #22
Source File: IMDBHTMLMovieParser.java    From J-Kinopoisk2IMDB with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 */
@Override
protected Function<Element, Movie> getDataMapper() {
    return element -> {
        // Year and type are both parsed from the element's text.
        final String text = element.text();
        return new Movie(
                parseTitle(element),
                parseYear(text),
                parseType(text),
                null,
                parseIMDBId(element.getElementsByTag("a").first())
        );
    };
}
 
Example #23
Source File: AppsGamesCatalogApi.java    From 4pdaClient-plus with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the topics listed under one catalog entry of the games catalog page.
 *
 * The page at {@code GAMES_CATALOG_URL} is fetched, the block following the
 * {@code <a name="entry{catalogId}">} anchor is cut out and every
 * {@code ol[type=1]>li} item with a link becomes a {@code Topic} (id from the
 * link's {@code showtopic} query parameter, title from the link text,
 * description from the text after the link when present).
 *
 * @param client    client used to fetch the catalog page
 * @param catalogId id of the catalog entry; matched literally
 * @return the topics found; empty when the entry is not on the page
 * @throws IOException declared for the page request
 */
public static ArrayList<Topic> loadCategoryThemes(IHttpClient client, String catalogId) throws IOException {
    String pageBody = client.performGet(GAMES_CATALOG_URL).getResponseBody();
    ArrayList<Topic> res = new ArrayList<>();

    // Quote the id so regex metacharacters in it cannot alter the pattern.
    Pattern entryPattern = Pattern.compile("<a name=\"entry" + Pattern.quote(String.valueOf(catalogId)) + "\">([\\s\\S]*?)</div>(?:<!--Begin Msg Number|<!-- TABLE FOOTER)", Pattern.CASE_INSENSITIVE);
    Matcher entryMatcher = entryPattern.matcher(pageBody);
    if (!entryMatcher.find()) {
        return res;
    }

    // Compiled once instead of once per list item.
    Pattern descriptionPattern = Pattern.compile("</a>(?:\\s*</b>\\s*-\\s*)(.*)?(?:<br\\s*/>|$)", Pattern.CASE_INSENSITIVE);

    Document doc = Jsoup.parse(entryMatcher.group(1));
    for (Element topicElement : doc.select("ol[type=1]>li")) {
        Elements links = topicElement.select("a");
        if (links.isEmpty()) {
            continue;
        }

        Element link = links.get(0);
        Uri uri = Uri.parse(link.attr("href"));

        Topic topic = new Topic(uri.getQueryParameter("showtopic"), link.text());
        Matcher descriptionMatcher = descriptionPattern.matcher(topicElement.html());
        if (descriptionMatcher.find()) {
            topic.setDescription(descriptionMatcher.group(1));
        }
        res.add(topic);
    }

    return res;
}
 
Example #24
Source File: EncodingDetect.java    From HaoReader with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Determines the encoding declared in an HTML document's {@code meta} tags.
 *
 * The bytes are decoded as UTF-8 for parsing. For each {@code meta} tag, a
 * non-empty {@code charset} attribute wins; otherwise, for
 * {@code http-equiv="content-type"}, the charset is taken from the
 * {@code content} attribute. When nothing is declared, or parsing fails,
 * {@code getJavaEncode(bytes)} is returned.
 *
 * @param bytes raw document bytes
 * @return the declared charset name (trimmed), or the result of
 *         {@code getJavaEncode(bytes)}
 */
public static String getHtmlEncode(@NonNull byte[] bytes) {
    try {
        Document doc = Jsoup.parse(new String(bytes, StandardCharsets.UTF_8));
        for (Element metaTag : doc.getElementsByTag("meta")) {
            String charsetStr = metaTag.attr("charset").trim();
            if (!isEmpty(charsetStr)) {
                return charsetStr;
            }
            if ("content-type".equalsIgnoreCase(metaTag.attr("http-equiv"))) {
                charsetStr = charsetFromContentType(metaTag.attr("content"));
                if (!isEmpty(charsetStr)) {
                    return charsetStr;
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return getJavaEncode(bytes);
}

/**
 * Extracts the charset from a content-type value such as
 * {@code "text/html; charset=utf-8"}.
 *
 * With a {@code charset} key, the text after the following {@code =} is
 * used; without one, the text after the first {@code ;}. The value ends at
 * the next {@code ;} and is trimmed. Returns the empty string when neither
 * form is present, so a bare {@code "text/html"} is never reported as an
 * encoding.
 */
private static String charsetFromContentType(String content) {
    final String key = "charset";
    // Case-insensitive search without locale-dependent lower-casing.
    int keyAt = -1;
    for (int i = 0; i + key.length() <= content.length(); i++) {
        if (content.regionMatches(true, i, key, 0, key.length())) {
            keyAt = i;
            break;
        }
    }

    final String value;
    if (keyAt >= 0) {
        int equalsAt = content.indexOf('=', keyAt + key.length());
        if (equalsAt < 0) {
            return "";
        }
        value = content.substring(equalsAt + 1);
    } else {
        int semicolonAt = content.indexOf(';');
        if (semicolonAt < 0) {
            return "";
        }
        value = content.substring(semicolonAt + 1);
    }

    int end = value.indexOf(';');
    return (end >= 0 ? value.substring(0, end) : value).trim();
}
 
Example #25
Source File: Elements.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Removes the named attribute from every matched element, by calling
 * {@code removeAttr} on each one in turn.
 * @param attributeKey key of the attribute to remove.
 * @return this (for chaining)
 */
public Elements removeAttr(String attributeKey) {
    for (final Element matched : this) {
        matched.removeAttr(attributeKey);
    }
    return this;
}
 
Example #26
Source File: ImgboxRipper.java    From ripme with MIT License 5 votes vote down vote up
/**
 * Collects the image urls of a page by rewriting the {@code src} of each
 * thumbnail matched by {@code div.boxed-content > a > img}.
 *
 * @param doc page to scan
 * @return the rewritten urls, in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> imageURLs = new ArrayList<>();
    for (Element thumb : doc.select("div.boxed-content > a > img")) {
        // thumbnail url -> image url: three textual substitutions, in order
        String fullSize = thumb.attr("src")
                .replaceAll("thumbs", "images")
                .replace("_b", "_o")
                .replaceAll("\\d-s", "i");
        imageURLs.add(fullSize);
    }
    return imageURLs;
}
 
Example #27
Source File: DataAttributeMapperTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/** Mapping a plain {@code p} element must leave the collector without annotations. */
@Test
public void testElementWithoutTypes() {
  final Element paragraph = new Element(Tag.valueOf("p"), "");

  mapper.map(jCas, paragraph, collector);

  assertNull(collector.getAnnotations());
}
 
Example #28
Source File: BootstrapHandler.java    From flow with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the element for one dependency described as JSON.
 *
 * When the JSON's type is a known {@code Dependency.Type}, creation is
 * delegated to the overload taking the URI resolver, with inline load mode;
 * otherwise the JSON's contents are parsed with the XML parser.
 *
 * @param context        bootstrap context supplying the URI resolver
 * @param dependencyJson JSON description of the dependency
 * @return the created element
 */
private Element createDependencyElement(BootstrapContext context,
        JsonObject dependencyJson) {
    final String type = dependencyJson.getString(Dependency.KEY_TYPE);
    if (!Dependency.Type.contains(type)) {
        final String contents = dependencyJson.getString(Dependency.KEY_CONTENTS);
        return Jsoup.parse(contents, "", Parser.xmlParser());
    }

    final Dependency.Type dependencyType = Dependency.Type.valueOf(type);
    return createDependencyElement(context.getUriResolver(),
            LoadMode.INLINE, dependencyJson, dependencyType);
}
 
Example #29
Source File: ApiTest.java    From MyBlogDemo with Apache License 2.0 5 votes vote down vote up
@Test
    public void testJsoup3() throws IOException {
        Document document = Jsoup.connect("https://android-arsenal.com/search?q=Circle").get();
        Elements matches = document.select("div.pc");

        // Print at most the first three matches, each followed by a separator.
        final int limit = Math.min(3, matches.size());
        for (int index = 0; index < limit; index++) {
            System.out.println(matches.get(index).toString());
            System.out.println("---------------------------------------");
        }
    }
 
Example #30
Source File: TemplateDataAnalyzer.java    From flow with Apache License 2.0 5 votes vote down vote up
/**
 * Tells whether {@code element}, or one of its ancestors below
 * {@code templateRoot}, is a {@code template} element (tag name compared
 * case-insensitively).
 *
 * The walk goes up the parent chain and stops at {@code templateRoot}. It
 * also stops, returning false, when the chain ends without reaching
 * {@code templateRoot}, instead of failing with a NullPointerException.
 *
 * @param element      element to start from
 * @param templateRoot ancestor at which the search ends (itself not checked)
 * @return true if a {@code template} element is found before
 *         {@code templateRoot}
 */
private boolean isInsideTemplate(org.jsoup.nodes.Element element,
        org.jsoup.nodes.Element templateRoot) {
    org.jsoup.nodes.Element current = element;
    // parent() yields null at the top of the tree; treat that as "not inside".
    while (current != null && current != templateRoot) {
        if ("template".equalsIgnoreCase(current.tagName())) {
            return true;
        }
        current = current.parent();
    }
    return false;
}