Java Code Examples for org.jsoup.nodes.Element#html()

The following examples show how to use org.jsoup.nodes.Element#html(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: Rgaa3Extractor.java    From Asqatasun with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Walks every element matched by {@code CRITERION_SELECTOR} that has a non-blank id and
 * <ul>
 *   <li>appends one {@code key=value} line to the criterion buffer, where the key is the id
 *       with {@code "crit"} replaced by {@code "Rgaa30"} and the value is the rule content
 *       extracted from the element's inner HTML;</li>
 *   <li>records the text found between the first {@code [} and the first {@code ]} of the
 *       element's text in {@code levelFromCrit}, keyed by the id without {@code "crit-"}.</li>
 * </ul>
 * The buffer is written to {@code CRITERION_I18N_FILE_PATH} only when
 * {@code writeCritInFile} is set.
 *
 * @param doc parsed document to scan
 * @throws IOException if writing the criterion file fails
 */
private static void extractLevelFromCriterionAndWrite(Document doc) throws IOException {
    StringBuilder crit = new StringBuilder();
    for (Element el : doc.select(CRITERION_SELECTOR)) {
        String id = el.id();
        if (!StringUtils.isNotBlank(id)) {
            continue;
        }

        // Keep the HTML starting at the space that follows the first "] ".
        String html = el.html();
        String ruleContent = extractRuleContent(html.substring(html.indexOf("] ") + 1));
        crit.append(id.replace("crit", "Rgaa30"))
            .append("=")
            .append(ruleContent)
            .append("\n");

        // The level is whatever sits between the first pair of square brackets.
        String text = el.text();
        String level = text.substring(text.indexOf("[") + 1, text.indexOf("]"));
        levelFromCrit.put(id.replaceAll("crit-", ""), level);
    }

    if (writeCritInFile) {
        FileUtils.write(new File(CRITERION_I18N_FILE_PATH), crit.toString());
    }
}
 
Example 2
Source File: RosiMM.java    From PicKing with Apache License 2.0 6 votes vote down vote up
/**
 * Looks for the "next page" link inside the inline scripts of a GBK-encoded page.
 *
 * @param baseUrl    prefix prepended to the relative link that is found
 * @param currentUrl not used by this implementation
 * @param result     raw page bytes, decoded as {@code gbk}
 * @return {@code baseUrl + "rosimm/" + <index_N.htm>} for the first script that contains the
 *         link, or an empty string when no script contains it
 * @throws UnsupportedEncodingException if the {@code gbk} charset is not available
 */
@Override
public String getContentNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    Document document = Jsoup.parse(new String(result, "gbk"));
    // Compiled once: the pattern does not depend on the script being inspected.
    // NOTE(review): the '.' before "htm" is unescaped, so it matches any character.
    Pattern nextPagePattern = Pattern.compile("index_\\d*.htm\">下一页");
    for (Element script : document.select("script")) {
        String code = script.html();
        if (code.isEmpty()) {
            continue;
        }
        Matcher matcher = nextPagePattern.matcher(code);
        if (matcher.find()) {
            String match = matcher.group();
            // Drop the trailing 5 characters (the closing quote, '>' and the 3-character label).
            return baseUrl + "rosimm/" + match.substring(0, match.length() - 5);
        }
    }
    return "";
}
 
Example 3
Source File: GithubHotProcessor.java    From hot-crawler with MIT License 6 votes vote down vote up
/**
 * Builds an {@link Info} from one list entry.
 * <p>
 * The link is the first {@code <a>} inside the first {@code <h1>}. The title is the part of
 * its {@code href} after the first {@code '/'} found at index 1 or later, followed by
 * {@code ". "} and the inner HTML of the first {@code <p>} (empty when there is none).
 * The URL is {@code prefix} followed by the unmodified {@code href}.
 *
 * @param element entry to convert
 * @return the title/URL pair for this entry
 */
@Override
protected Info getInfoByElement(Element element) {
    Element link = element.getElementsByTag("h1").get(0).getElementsByTag("a").get(0);

    // The description paragraph is optional.
    String description = element.getElementsByTag("p").isEmpty()
            ? ""
            : element.getElementsByTag("p").get(0).html();

    String href = link.attr("href");
    String title = href.substring(href.indexOf('/', 1) + 1) + ". " + description;
    String url = this.prefix + href;
    return new Info(title, url);
}
 
Example 4
Source File: WeiboHotProcessor.java    From hot-crawler with MIT License 6 votes vote down vote up
/**
 * Converts table rows into {@link Info} entries numbered from 1.
 * <p>
 * The first two rows are removed from {@code elements} before conversion (the passed
 * collection is modified). For each remaining row the first {@code <a>} inside the first
 * {@code td-02} cell supplies the title (inner HTML) and the URL ({@code prefix + href}).
 *
 * @param elements rows to convert; may be {@code null}
 * @return the converted entries; empty when {@code elements} is {@code null} or holds
 *         nothing beyond the two leading rows
 */
@Override
protected List<Info> getInfoDataByElements(Elements elements) {
    List<Info> list = new ArrayList<>();
    if (elements == null) {
        return list;
    }

    // Remove the two leading tr elements. Guard on size so a short or empty result
    // yields an empty list instead of an IndexOutOfBoundsException.
    for (int removed = 0; removed < 2 && !elements.isEmpty(); removed++) {
        elements.remove(0);
    }

    int i = 0;
    for (Element element : elements) {
        Element itemElement = element.getElementsByClass("td-02").get(0).getElementsByTag("a").get(0);
        String id = String.valueOf(++i);
        String infoTitle = itemElement.html();
        String infoUrl = this.prefix + itemElement.attr("href");
        list.add(new Info(id, infoTitle, infoUrl));
    }
    return list;
}
 
Example 5
Source File: CloudmusicHotProcessor.java    From hot-crawler with MIT License 6 votes vote down vote up
/**
 * Converts each element into an {@link Info} built from its first {@code <a>}: the title is
 * the link's inner HTML and the URL is {@code prefix + "#" + href}. Ids count up from 1 and
 * only advance for elements that actually contain a link.
 * <p>
 * Elements that raise a {@link NullPointerException} or {@link IndexOutOfBoundsException}
 * during conversion are logged and skipped.
 *
 * @param elements elements to convert; may be {@code null}
 * @return the converted entries, possibly empty
 */
@Override
protected List<Info> getInfoDataByElements(Elements elements) {
    List<Info> list = new ArrayList<>();
    if (elements == null) {
        return list;
    }

    int counter = 0;
    for (Element element : elements) {
        try {
            Element link = element.getElementsByTag("a").get(0);
            String id = String.valueOf(++counter);
            String url = this.prefix + "#" + link.attr("href");
            String title = link.html();
            list.add(new Info(id, title, url));
        } catch (NullPointerException | IndexOutOfBoundsException e) {
            log.error("Can't found item element by attribute!", e);
        }
    }
    return list;
}
 
Example 6
Source File: TestSession.java    From actframework with Apache License 2.0 6 votes vote down vote up
/**
 * Tells whether {@code a} matches {@code b}.
 * <p>
 * Two values match when {@code $.eq} says so. Otherwise, if {@code a} is an {@link Element}
 * and {@code b} is a {@link String}, they match when the string equals (ignoring case) the
 * element's inner HTML, its text, or its value - checked in that order.
 *
 * @param a actual value
 * @param b expected value
 * @return {@code true} on a match
 */
private static boolean matches(Object a, Object b) {
    if ($.eq(a, b)) {
        return true;
    }
    if (!(b instanceof String) || !(a instanceof Element)) {
        return false;
    }
    String expected = S.string(b);
    Element element = (Element) a;
    // Short-circuit order: html, then text, then val.
    return S.eq(element.html(), expected, S.IGNORECASE)
            || S.eq(element.text(), expected, S.IGNORECASE)
            || S.eq(element.val(), expected, S.IGNORECASE);
}
 
Example 7
Source File: RedgifsRipper.java    From ripme with MIT License 6 votes vote down vote up
/**
 * Collects media URLs from a page.
 * <p>
 * Profile and search pages are delegated to {@code hasURLs}. For any other page, every
 * {@code <script>} whose content starts with <code>{</code> is parsed as JSON and its
 * {@code video.contentUrl} string is collected.
 *
 * @param doc page to scan
 * @return the URLs found
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    if (isProfile().matches() || isSearch().matches()) {
        return hasURLs(doc);
    }

    List<String> urls = new ArrayList<>();
    for (Element script : doc.select("script")) {
        String json = script.html();
        if (!json.startsWith("{")) {
            continue;
        }
        JSONObject page = new JSONObject(json);
        urls.add(page.getJSONObject("video").getString("contentUrl"));
    }
    return urls;
}
 
Example 8
Source File: JSVarFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Evaluates the page's inline scripts and copies the resulting JavaScript variables into
 * the bean's {@code @JSVar} fields.
 * <p>
 * A script context is entered, seeded with empty {@code window} and {@code document}
 * objects, and every non-empty {@code <script>} of the response is evaluated in it.
 * Scripts that fail to evaluate are skipped. Each field annotated with {@link JSVar} is
 * then resolved through {@code injectJsVarField}; non-null values are put into
 * {@code beanMap} under the field name.
 *
 * @param request  request that produced the response
 * @param response response whose content is parsed
 * @param beanMap  map receiving the resolved field values
 * @param bean     bean whose class is scanned for {@code @JSVar} fields
 */
@Override
@SuppressWarnings({ "unchecked" })
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
	Context cx = Context.enter();
	// Always pair enter() with exit(), even when parsing or injection throws.
	try {
		ScriptableObject scope = cx.initSafeStandardObjects();
		String windowScript = "var window = {};var document = {};";
		cx.evaluateString(scope, windowScript, "window", 1, null);
		HtmlParser parser = new HtmlParser(request.getUrl(), response.getContent());
		for (Element ele : parser.$("script")) {
			String sc = ele.html();
			if (StringUtils.isNotEmpty(sc)) {
				try {
					cx.evaluateString(scope, sc, "", 1, null);
				} catch (Exception ignored) {
					// Best effort: a script that cannot be evaluated is skipped so the
					// remaining scripts still get a chance to define their variables.
				}
			}
		}
		Map<String, Object> fieldMap = new HashMap<String, Object>();
		Set<Field> jsVarFields = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(JSVar.class));
		for (Field jsVarField : jsVarFields) {
			Object value = injectJsVarField(request, beanMap, jsVarField, cx, scope);
			if (value != null) {
				fieldMap.put(jsVarField.getName(), value);
			}
		}
		beanMap.putAll(fieldMap);
	} finally {
		Context.exit();
	}
}
 
Example 9
Source File: TianLaiReadUtil.java    From MissZzzReader with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the chapter list from an HTML page.
 * <p>
 * Chapters are read from the {@code <dd>} entries of the first {@code <dl>} inside the
 * element with id {@code list}. The first {@code <a>} of each entry supplies the title
 * (inner HTML) and the URL ({@code href}). An entry whose title equals the previously
 * accepted title is skipped. The URL is prefixed according to the book's source.
 *
 * @param html page markup
 * @param book book the chapters belong to; its source selects the URL prefix
 * @return the chapters in document order, numbered from 0; empty when the page has no
 *         {@code #list} element or that element has no {@code <dl>}
 */
public static ArrayList<Chapter> getChaptersFromHtml(String html,Book book) {
    ArrayList<Chapter> chapters = new ArrayList<>();
    Document doc = Jsoup.parse(html);

    // getElementById returns null when the id is absent; treat that as "no chapters".
    Element divList = doc.getElementById("list");
    if (divList == null) {
        return chapters;
    }
    Elements dls = divList.getElementsByTag("dl");
    if (dls.isEmpty()) {
        return chapters;
    }
    Element dl = dls.get(0);

    String lastTitle = null;
    int i = 0;
    for (Element dd : dl.getElementsByTag("dd")) {
        Elements as = dd.getElementsByTag("a");
        if (as.isEmpty()) {
            continue;
        }
        Element a = as.get(0);
        String title = a.html();
        // Skip an entry that repeats the title of the previously accepted one.
        if (!StringHelper.isEmpty(lastTitle) && title.equals(lastTitle)) {
            continue;
        }
        Chapter chapter = new Chapter();
        chapter.setNumber(i++);
        chapter.setTitle(title);
        String url = a.attr("href");
        if (StringHelper.isEmpty(book.getSource()) || BookSource.tianlai.toString().equals(book.getSource())) {
            url = URLCONST.nameSpace_tianlai + url;
        } else if (BookSource.biquge.toString().equals(book.getSource())) {
            url = book.getChapterUrl() + url;
        }
        chapter.setUrl(url);
        chapters.add(chapter);
        lastTitle = title;
    }

    return chapters;
}
 
Example 10
Source File: IfanrHotProcessor.java    From hot-crawler with MIT License 5 votes vote down vote up
/**
 * Builds an {@link Info} from the first descendant carrying the
 * {@code js-title-transform} class: its inner HTML is the title, its {@code href} the URL.
 *
 * @param element entry to convert
 * @return the title/URL pair for this entry
 */
@Override
protected Info getInfoByElement(Element element) {
    Element titleLink = element.getElementsByClass("js-title-transform").get(0);
    String url = titleLink.attr("href");
    String title = titleLink.html();
    return new Info(title, url);
}
 
Example 11
Source File: HuxiuHotProcessor.java    From hot-crawler with MIT License 5 votes vote down vote up
/**
 * Builds an {@link Info} for one article entry.
 * <p>
 * The title is the inner HTML of the first {@code article-item__content__title} element.
 * The URL is {@code prefix} followed by the {@code href} of the parent of the first
 * {@code article-item__img} element.
 *
 * @param element entry to convert
 * @return the populated info object
 */
@Override
protected Info getInfoByElement(Element element) {
    String title = element.getElementsByClass("article-item__content__title").get(0).html();
    Element linkHolder = element.getElementsByClass("article-item__img").get(0).parent();
    String url = new StringBuilder(this.prefix)
            .append(linkHolder.attr("href"))
            .toString();

    Info info = new Info();
    info.setTitle(title);
    info.setUrl(url);
    return info;
}
 
Example 12
Source File: ImageExtensions.java    From Android-WYSIWYG-Editor with Apache License 2.0 5 votes vote down vote up
/**
 * Loads a remote image and, when a source node is supplied, uses the node's inner HTML as
 * the description and applies the node's styles to the description field.
 *
 * @param _path image location handed to {@code loadImageRemote}
 * @param node  source element; may be {@code null}, in which case no description is passed
 *              and no styles are applied
 */
public void loadImage(String _path, Element node) {
    final String desc = (node == null) ? null : node.html();
    final View childLayout = loadImageRemote(_path, desc);
    CustomEditText text = childLayout.findViewById(R.id.desc);
    if (node == null) {
        return;
    }
    componentsWrapper.getInputExtensions().applyStyles(text, node);
}
 
Example 13
Source File: HtmlField.java    From jspoon with MIT License 5 votes vote down vote up
/**
 * Reads the raw string value for this field from {@code node}.
 * <p>
 * The spec's attribute selects the source: {@code ""}/{@code "text"} use the element text,
 * {@code "html"}/{@code "innerHtml"} the inner HTML, {@code "outerHtml"} the outer HTML,
 * and anything else is looked up as an attribute of the element. When the spec has a regex
 * and it is found in the value, the result is capture group 1, or the spec's default value
 * if the regex has no groups or the group is null or empty. When the regex is not found
 * the unprocessed value is returned.
 *
 * @param node      element to read from; {@code null} yields the spec's default value
 * @param fieldType target field type (not used by this method)
 * @return the extracted value
 */
private <U> String getValue(Element node, Class<U> fieldType) {
    if (node == null) {
        return spec.getDefaultValue();
    }

    final String raw;
    switch (spec.getAttribute()) {
    case "":
    case "text":
        raw = node.text();
        break;
    case "html":
    case "innerHtml":
        raw = node.html();
        break;
    case "outerHtml":
        raw = node.outerHtml();
        break;
    default:
        raw = node.attr(spec.getAttribute());
        break;
    }

    if (spec.getRegex() == null) {
        return raw;
    }
    Matcher matcher = Pattern.compile(spec.getRegex()).matcher(raw);
    if (!matcher.find()) {
        return raw;
    }

    String captured = (matcher.groupCount() > 0) ? matcher.group(1) : spec.getDefaultValue();
    if (captured == null || captured.isEmpty()) {
        return spec.getDefaultValue();
    }
    return captured;
}
 
Example 14
Source File: InputExtensions.java    From Android-WYSIWYG-Editor with Apache License 2.0 5 votes vote down vote up
/**
 * Renders one HTML element into the editor.
 * <p>
 * {@code h1}-{@code h3} are handed to {@code RenderHeader}. {@code p}, {@code div} and
 * {@code blockquote} insert a text view holding the element's inner HTML after the parent
 * view's current children and apply the element's styles; {@code blockquote} additionally
 * gets the block-quote text style first. Other tags known to {@code HtmlTag} do nothing.
 *
 * @param element element to render; its lower-cased tag name must be an {@code HtmlTag}
 *                constant
 * @return always {@code null}
 */
@Override
public Node buildNodeFromHTML(Element element) {
    HtmlTag tag = HtmlTag.valueOf(element.tagName().toLowerCase());
    switch (tag) {
        case h1:
        case h2:
        case h3:
            RenderHeader(tag, element);
            break;
        case p:
        case div:
        case blockquote: {
            String innerHtml = element.html();
            int insertAt = editorCore.getParentView().getChildCount();
            TextView view = insertEditText(insertAt, null, innerHtml);
            if (tag == HtmlTag.blockquote) {
                UpdateTextStyle(EditorTextStyle.BLOCKQUOTE, view);
            }
            applyStyles(view, element);
            break;
        }
        default:
            break;
    }
    return null;
}
 
Example 15
Source File: ModifyHTMLElement.java    From localization_nifi with Apache License 2.0 4 votes vote down vote up
/**
 * Rewrites the elements selected by the configured CSS selector and routes the result.
 * <p>
 * The flow file is parsed as HTML and queried with the {@code CSS_SELECTOR} property. A
 * parse or selection failure routes the flow file to {@code REL_INVALID_HTML}; an empty
 * selection routes it to {@code REL_NOT_FOUND}. Otherwise every selected element has its
 * HTML, an attribute, or its text replaced with the {@code MODIFIED_VALUE} property,
 * depending on {@code OUTPUT_TYPE}. The modified document is written as UTF-8 to a new
 * child flow file that carries the number of selected elements as an attribute and goes
 * to {@code REL_SUCCESS}; the original flow file goes to {@code REL_ORIGINAL}.
 *
 * @param context processor context supplying the property values
 * @param session session used to read, create and transfer flow files
 * @throws ProcessException declared by the processor contract
 */
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final Document doc;
    final Elements eles;
    try {
        doc = parseHTMLDocumentFromFlowfile(flowFile, context, session);
        eles = doc.select(context.getProperty(CSS_SELECTOR).evaluateAttributeExpressions(flowFile).getValue());
    } catch (Exception ex) {
        getLogger().error("Failed to extract HTML from {} due to {}; routing to {}", new Object[] {flowFile, ex.toString(), REL_INVALID_HTML.getName()}, ex);
        session.transfer(flowFile, REL_INVALID_HTML);
        return;
    }

    final String modifiedValue = context.getProperty(MODIFIED_VALUE).evaluateAttributeExpressions(flowFile).getValue();

    if (eles == null || eles.isEmpty()) {
        // No element found
        session.transfer(flowFile, REL_NOT_FOUND);
        return;
    }

    // The output type is the same for every element, so read it once.
    final String outputType = context.getProperty(OUTPUT_TYPE).getValue();
    for (Element ele : eles) {
        switch (outputType) {
            case ELEMENT_HTML:
                ele.html(modifiedValue);
                break;
            case ELEMENT_ATTRIBUTE:
                ele.attr(context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions(flowFile).getValue(), modifiedValue);
                break;
            case ELEMENT_TEXT:
                ele.text(modifiedValue);
                break;
        }
    }

    FlowFile ff = session.write(session.create(flowFile), new StreamCallback() {
        @Override
        public void process(InputStream in, OutputStream out) throws IOException {
            out.write(doc.html().getBytes(StandardCharsets.UTF_8));
        }
    });
    // String.valueOf replaces the deprecated Integer constructor.
    ff = session.putAttribute(ff, NUM_ELEMENTS_MODIFIED_ATTR, String.valueOf(eles.size()));
    session.transfer(ff, REL_SUCCESS);

    // Transfer the original HTML
    session.transfer(flowFile, REL_ORIGINAL);
}
 
Example 16
Source File: WebComponentBootstrapHandler.java    From flow with Apache License 2.0 4 votes vote down vote up
/**
 * Writes a JavaScript snippet that recreates the children of the given bootstrap
 * {@code head} inside the head of the embedding page.
 * <p>
 * For each child that may be transferred, the script creates an element of the same tag,
 * copies its attributes, sets its {@code innerHTML} (when the child has any) and appends
 * it to {@code document.head}. Children that must not be transferred are offered to
 * {@code getElementForShadowDom} instead; the elements obtained that way are registered
 * as shadow DOM elements on the {@link WebComponentConfigurationRegistry}.
 *
 * @param contentType
 *            Content type of the response.
 * @param response
 *            {@link com.vaadin.flow.server.VaadinResponse} into which the
 *            script is written
 * @param head
 *            head element of the bootstrap page whose children are copied
 * @param serviceUrl
 *            base path passed on when the attributes are transferred
 * @throws IOException
 *             if writing fails
 */
protected void writeBootstrapPage(String contentType,
        VaadinResponse response, Element head, String serviceUrl)
        throws IOException {
    response.setContentType(contentType);

    // Elements destined for the web component shadow DOMs rather than the page head.
    ArrayList<com.vaadin.flow.dom.Element> shadowElements = new ArrayList<>();

    try (BufferedWriter out = new BufferedWriter(
            new OutputStreamWriter(response.getOutputStream(), UTF_8))) {
        // Name of the JavaScript variable holding the element under construction.
        String jsVar = "headElem";
        out.append("var ").append(jsVar).append("=null;");

        for (Element child : head.children()) {
            if (!elementShouldNotBeTransferred(child)) {
                // innerHTML does not execute <script> blocks, so every element is
                // rebuilt with document.createElement rather than copied as a string.
                out.append(jsVar).append("=");
                out.append("document.createElement('")
                        .append(child.tagName()).append("');");
                transferAttribute(out, jsVar, child, serviceUrl);

                String innerHtml = child.html();
                if (innerHtml != null && !innerHtml.isEmpty()) {
                    out.append(jsVar).append(".innerHTML=\"")
                            .append(inlineHTML(innerHtml)).append("\";");
                }
                out.append("document.head.appendChild(").append(jsVar)
                        .append(");");
            } else {
                getElementForShadowDom(child)
                        .ifPresent(shadowElements::add);
            }
        }
    }

    WebComponentConfigurationRegistry
            .getInstance(response.getService().getContext())
            .setShadowDomElements(shadowElements);
}
 
Example 17
Source File: JsoupUtil.java    From materialup with Apache License 2.0 4 votes vote down vote up
/**
 * Null-safe accessor for an element's inner HTML.
 *
 * @param e element to read; may be {@code null}
 * @return the element's inner HTML, or {@code null} when {@code e} is {@code null}
 */
private static String html(Element e) {
    return (e == null) ? null : e.html();
}
 
Example 18
Source File: SubHDCommon.java    From SubTitleSearcher with Apache License 2.0 4 votes vote down vote up
/**
 * Gets the list of download entries from a detail page.
 * <p>
 * The page at {@code baseUrl + url} is fetched and every {@code .d_table tr} row that
 * contains a {@code .dt_edition} cell becomes one JSON object with the keys {@code url},
 * {@code title}, {@code ext}, {@code lang}, {@code rate}, {@code downCount},
 * {@code labelInfo} and {@code zimuzu}. {@code ext} and {@code lang} are comma-separated
 * lists of the known names that appear in the lower-cased row HTML as {@code >name<};
 * each falls back to a fixed placeholder when nothing matches.
 *
 * @param url path appended to {@code baseUrl}
 * @return one JSON object per qualifying row
 */
public static JSONArray getDetailList(String url) {
	String pageUrl = baseUrl + url;
	String result = HtHttpUtil.http.get(pageUrl, HtHttpUtil.http.default_charset, HtHttpUtil.http._ua, pageUrl);
	Document doc = Jsoup.parse(result);
	String[] langList = new String[] {"双语", "简体", "繁体", "英文"};

	JSONArray detailList = new JSONArray();
	for (Element matchRow : doc.select(".d_table tr")) {
		if (matchRow.select(".dt_edition").isEmpty()) {
			continue;
		}
		String htmlLower = matchRow.html().toLowerCase();
		String downUrl = matchRow.select(".dt_down a").attr("href");
		String title = matchRow.select(".dt_edition a").text().trim();
		int downCount = Integer.parseInt(RegexUtil.getMatchStr(matchRow.select(".dt_count").text(), "([\\d]+)"));

		// Subtitle file extensions mentioned in the row.
		StringBuilder extNames = new StringBuilder();
		for (String extName : AppConfig.subExtNames) {
			if (htmlLower.contains(">" + extName + "<")) {
				extNames.append(extName).append(",");
			}
		}
		String ext = extNames.length() > 0
				? extNames.substring(0, extNames.length() - 1)
				: "其它";

		// Languages mentioned in the row.
		StringBuilder langNames = new StringBuilder();
		for (String langName : langList) {
			if (htmlLower.contains(">" + langName + "<")) {
				langNames.append(langName).append(",");
			}
		}
		String lang = langNames.length() > 0
				? langNames.substring(0, langNames.length() - 1)
				: "其它";

		// Texts of all labels, comma-separated.
		StringBuffer labelInfo = new StringBuffer();
		for (Element label : matchRow.select(".label")) {
			labelInfo.append(label.text() + ",");
		}
		if (labelInfo.length() > 0) {
			labelInfo.setLength(labelInfo.length() - 1);
		}
		String zimuzu = matchRow.select("a.gray").text();

		JSONObject dataRow = new JSONObject();
		dataRow.put("url", downUrl);
		dataRow.put("title", title);
		dataRow.put("ext", ext);
		dataRow.put("lang", lang);
		dataRow.put("rate", "-");
		dataRow.put("downCount", downCount);
		dataRow.put("labelInfo", labelInfo);
		dataRow.put("zimuzu", zimuzu);
		detailList.add(dataRow);
	}
	return detailList;
}
 
Example 19
Source File: ParsePxgav.java    From v9porn with MIT License 4 votes vote down vote up
/**
 * Parses a video page into the player configuration plus the list of articles found in
 * the first {@code penci-block_content} container.
 * <p>
 * The player configuration is the slice of the entry-content HTML that starts six
 * characters after the first {@code "setup"} and ends at the first {@code ");"}; it is
 * deserialized with Gson.
 *
 * @param html original page markup
 * @return result wrapper whose data is the parsed configuration with the article list
 *         attached
 */
public static BaseResult<PxgavVideoParserJsonResult> parserVideoUrl(String html) {
    BaseResult<PxgavVideoParserJsonResult> baseResult = new BaseResult<>();
    Document document = Jsoup.parse(html);

    String videoHtml = document.getElementsByClass("penci-entry-content entry-content").first().html();
    Logger.t(TAG).d(videoHtml);
    // Skips "setup" plus one more character (presumably the opening parenthesis).
    String setupJson = videoHtml.substring(videoHtml.indexOf("setup") + 6, videoHtml.indexOf(");"));
    Logger.t(TAG).d(setupJson);

    PxgavVideoParserJsonResult parsed = new Gson().fromJson(setupJson, PxgavVideoParserJsonResult.class);

    List<PxgavModel> models = new ArrayList<>();
    for (Element article : document.getElementsByClass("penci-block_content").first().select("article")) {
        PxgavModel model = new PxgavModel();
        Element link = article.selectFirst("a");
        model.setTitle(link.attr("title"));
        model.setContentUrl(link.attr("href"));

        // The image location is embedded in the inline style as url(...).
        String style = link.attr("style");
        String bigImg = StringUtils.subString(style, style.indexOf("url(") + 4, style.lastIndexOf("-"));
        Logger.t(TAG).d(bigImg);
        model.setImgUrl(TextUtils.isEmpty(bigImg) ? style : bigImg + ".jpg");

        // NOTE(review): the bounds are computed on bigImg but applied to the full style
        // string, so the offsets may not line up - confirm the intended source string.
        String pId = StringUtils.subString(style, bigImg.lastIndexOf("/") + 1, bigImg.lastIndexOf("-"));
        model.setpId(pId);

        models.add(model);
    }

    parsed.setPxgavModelList(models);
    baseResult.setData(parsed);
    return baseResult;
}
 
Example 20
Source File: ParsePxgav.java    From v9porn with MIT License 4 votes vote down vote up
/**
 * Parses a video page into the player configuration plus the list of articles found in
 * the first {@code penci-block_content} container.
 * <p>
 * The player configuration is the slice of the entry-content HTML that starts six
 * characters after the first {@code "setup"} and ends at the first {@code ");"}; it is
 * deserialized with Gson.
 *
 * @param html original page markup
 * @return result wrapper whose data is the parsed configuration with the article list
 *         attached
 */
public static BaseResult<PxgavVideoParserJsonResult> parserVideoUrl(String html) {
    BaseResult<PxgavVideoParserJsonResult> baseResult = new BaseResult<>();
    Document document = Jsoup.parse(html);

    String videoHtml = document.getElementsByClass("penci-entry-content entry-content").first().html();
    Logger.t(TAG).d(videoHtml);
    // Skips "setup" plus one more character (presumably the opening parenthesis).
    String setupJson = videoHtml.substring(videoHtml.indexOf("setup") + 6, videoHtml.indexOf(");"));
    Logger.t(TAG).d(setupJson);

    PxgavVideoParserJsonResult parsed = new Gson().fromJson(setupJson, PxgavVideoParserJsonResult.class);

    List<PxgavModel> models = new ArrayList<>();
    for (Element article : document.getElementsByClass("penci-block_content").first().select("article")) {
        PxgavModel model = new PxgavModel();
        Element link = article.selectFirst("a");
        model.setTitle(link.attr("title"));
        model.setContentUrl(link.attr("href"));

        // The image location is embedded in the inline style as url(...).
        String style = link.attr("style");
        String bigImg = StringUtils.subString(style, style.indexOf("url(") + 4, style.lastIndexOf("-"));
        Logger.t(TAG).d(bigImg);
        model.setImgUrl(TextUtils.isEmpty(bigImg) ? style : bigImg + ".jpg");

        // NOTE(review): the bounds are computed on bigImg but applied to the full style
        // string, so the offsets may not line up - confirm the intended source string.
        String pId = StringUtils.subString(style, bigImg.lastIndexOf("/") + 1, bigImg.lastIndexOf("-"));
        model.setpId(pId);

        models.add(model);
    }

    parsed.setPxgavModelList(models);
    baseResult.setData(parsed);
    return baseResult;
}