Java Code Examples for org.jsoup.select.Elements#remove()

The following examples show how to use org.jsoup.select.Elements#remove(), including the remove(int) and remove(Object) overloads it inherits from List. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: CSSReverter.java From BlogManagePlatform with Apache License 2.0

6 votes

/**
 * Turns externally linked CSS in the HTML into inline styles and removes the external
 * stylesheet links.
 * <p>
 * Every {@code link[href]} whose href ends with {@code .css} is read from the file at
 * {@code FreemarkerRender.getLoaderPath() + href}, prepended to the {@code html} element as a
 * {@code <style>} block and then removed from the document. Links that do not end with
 * {@code .css} were never inlined, so they are left in place.
 * <p>
 * If anything fails, the error is logged and the original HTML string is returned unchanged.
 * @author Frodez
 * @date 2019-03-21
 * @param html the HTML to process, must not be null
 * @return the HTML with stylesheets inlined, or the input itself when processing failed
 */
@Override
public String revert(String html) {
	Assert.notNull(html, "html must not be null");
	try {
		Document document = Jsoup.parse(html);
		Elements htmlElement = document.select("html");
		for (Element link : document.select("link[href]")) {
			String path = link.attr("href");
			if (!path.endsWith(".css")) {
				// not a stylesheet: nothing was inlined, so the link must stay in the document
				continue;
			}
			htmlElement.prepend(StrUtil.concat("<style type=\"text/css\">", FileUtil.readString(ResourceUtils
				.getFile(StrUtil.concat(FreemarkerRender.getLoaderPath(), path))), "</style>"));
			// drop only the link whose content has just been inlined
			link.remove();
		}
		return document.html();
	} catch (Exception e) {
		log.error("[frodez.util.renderer.reverter.CSSReverter.revert]", e);
		return html;
	}
}

Example 2

Source File: WeiboHotProcessor.java From hot-crawler with MIT License

6 votes

/**
 * Converts the given row elements into {@code Info} entries.
 * <p>
 * The two leading elements are dropped from the list; every remaining element contributes the
 * first {@code a} tag found inside its first {@code td-02} child. Entries are numbered from 1
 * and their URL is the link's href prefixed with {@code this.prefix}.
 *
 * @param elements the rows to convert; {@code null} yields an empty list
 * @return the extracted entries, never {@code null}
 */
@Override
protected List<Info> getInfoDataByElements(Elements elements) {
    List<Info> infos = new ArrayList<>();
    if (elements == null) {
        return infos;
    }
    // remove two tr elements
    elements.remove(0);
    elements.remove(0);
    int rank = 0;
    for (Element row : elements) {
        Element link = row.getElementsByClass("td-02").get(0).getElementsByTag("a").get(0);
        rank++;
        String href = link.attr("href");
        String title = link.html();
        infos.add(new Info(String.valueOf(rank), title, this.prefix + href));
    }
    return infos;
}

Example 3

Source File: MTGoldFishDashBoard.java From MtgDesktopCompanion with GNU General Public License v3.0

5 votes

/**
 * Loads the format-staples page for the given format and filter and parses its table rows
 * into {@code CardDominance} entries.
 * <p>
 * The first two rows matched by {@code table tr} are skipped. A row that cannot be parsed is
 * logged and ignored. When the page has fewer than two rows an empty list is returned
 * instead of failing with an {@code IndexOutOfBoundsException}.
 *
 * @param f      the format whose staples are requested
 * @param filter the page filter (spells, creatures, all, lands)
 * @return the parsed entries, possibly empty
 * @throws IOException declared by this method; NOTE(review): presumably raised while fetching the page — confirm
 */
@Override
public List<CardDominance> getBestCards(MagicFormat.FORMATS f, String filter) throws IOException {

	// filter: spells, creatures, all, lands

	String u = getString(WEBSITE) + "/format-staples/" + f.name().toLowerCase() + "/full/" + filter;

	// commander uses a dedicated page name instead of the enum name
	if (f == MagicFormat.FORMATS.COMMANDER)
		u = getString(WEBSITE) + "/format-staples/commander_1v1/full/" + filter;

	Document doc = URLTools.extractHtml(u);

	logger.debug("get best cards : " + u);
	Elements trs = doc.select("table tr");
	List<CardDominance> ret = new ArrayList<>();
	if (trs.size() < 2) {
		// fewer rows than the two that are always skipped: nothing to parse
		return ret;
	}
	trs.remove(0);
	trs.remove(0);
	for (Element e : trs) {
		Elements tds = e.select(MTGConstants.HTML_TAG_TD);
		try {
			// for the "lands" filter the percentage columns sit one position to the left
			int correct = filter.equalsIgnoreCase("lands") ? 1 : 0;

			CardDominance d = new CardDominance();
			d.setPosition(Integer.parseInt(tds.get(0).text()));
			d.setCardName(tds.get(1).text());
			// literal replace: no regex is needed to strip the percent sign
			d.setDecksPercent(Double.parseDouble(tds.get(3 - correct).text().replace("%", "")));
			d.setPlayers(Double.parseDouble(tds.get(4 - correct).text().replace("%", "")));

			ret.add(d);
		} catch (Exception ex) {
			logger.error("Error parsing " + tds, ex);
		}

	}
	return ret;
}

Example 4

Source File: TagServlet.java From firing-range with Apache License 2.0

5 votes

/**
 * Parses the {@code q} parameter as an HTML body fragment and hands the resulting elements,
 * together with the tag/attribute allow-list taken from the path info, to
 * {@code handleRequest}.
 * <p>
 * The path info is interpreted as {@code /<allowedTag>[/<allowedAttribute>[/...]]}; missing
 * parts default to the empty string. A 400 error is sent when {@code q} is missing or
 * produces no elements.
 *
 * @param request  the incoming request
 * @param response the response to write to
 * @throws IOException declared by this method; NOTE(review): presumably from writing the response — confirm
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
  String q = request.getParameter("q");
  if (q == null) {
    Responses.sendError(response, "Missing q parameter", 400);
    return;
  }

  Document doc = Jsoup.parseBodyFragment(q);
  Element body = doc.body();
  Elements elements = body.getAllElements();
  // the body element stays in the list only when the input itself mentions "body"
  if (!(q.contains("body"))){
    elements.remove(body);
  }

  if (elements.isEmpty()) {
    Responses.sendError(response, "Invalid input, no tags", 400);
    return;
  }

  String allowedTag = "";
  String allowedAttribute = "";
  if (request.getPathInfo() != null) {
    String pathInfo = request.getPathInfo().substring(1);
    // limit -1 keeps trailing empty segments, so "tag/" yields an empty attribute
    // instead of an ArrayIndexOutOfBoundsException on segments[1]
    String[] segments = pathInfo.split("/", -1);
    allowedTag = segments[0];
    if (segments.length > 1) {
      allowedAttribute = segments[1];
    }
  }
  handleRequest(elements, response, allowedTag, allowedAttribute);
}

Example 5

Source File: DescendantSelector.java From JsoupXpath with Apache License 2.0

5 votes

/**
 * Collects, for every context element, all elements returned by {@code getAllElements()}
 * except the context element itself, and returns them without duplicates.
 * <p>
 * The result keeps the order in which elements were first encountered, so repeated
 * evaluations give the same ordering.
 *
 * @param context the elements whose descendants are requested
 * @return an {@code XValue} wrapping the collected elements
 */
@Override
public XValue apply(Elements context) {
    // insertion-ordered set: de-duplicates like a HashSet but keeps a stable encounter order
    Set<Element> total = new java.util.LinkedHashSet<>();
    for (Element el : context) {
        Elements tmp = el.getAllElements();
        // exclude self
        // NOTE(review): depending on the jsoup version, Elements.remove(Object) may also
        // detach the element from the DOM — confirm against the version in use
        tmp.remove(el);
        total.addAll(tmp);
    }
    Elements descendant = new Elements();
    descendant.addAll(total);
    return XValue.create(descendant);
}

Example 6

Source File: StructuralHtml.java From baleen with Apache License 2.0

5 votes

/**
 * Builds the structure hierarchy for the CAS, walks it together with {@code body}, and then
 * applies a series of markup clean-ups to {@code body}.
 *
 * @param jCas the CAS the structure hierarchy is built from
 * @param body the element that receives the output and is cleaned up in place
 */
@Override
protected void writeBody(final JCas jCas, final Element body) {

  final Node<Structure> hierarchyRoot =
      StructureHierarchy.build(jCas, structuralClasses).getRoot();
  walk(body, hierarchyRoot);

  // Paragraphs placed directly under ul / ol need an li wrapper
  for (final String listParagraphs : new String[] {"ul > p", "ol > p"}) {
    body.select(listParagraphs).wrap("<li></li>");
  }

  // Cells inside a table header become th instead of td
  final Elements headerCells = body.select("thead td");
  headerCells.tagName("th");

  // Empty td / th cells get a non-breaking space as content
  final Elements emptyCells = body.select("td:empty,th:empty");
  emptyCells.html("&nbsp");

  if (!outputEmptyTags) {
    // Removing empty elements can leave their parents empty, so repeat until none are left
    for (Elements empties = emptyElements(body);
        !empties.isEmpty();
        empties = emptyElements(body)) {
      empties.remove();
    }
  }

  // TODO: In accordance with HTML spec
  // - Captions for Table should be moved inside the table
  // - Captions for Figure should be moved inside the figure

}

Example 7

Source File: RemoveEmptyText.java From baleen with Apache License 2.0

5 votes

/**
 * Removes one round of {@code :empty} elements (other than {@code body}) from the document.
 *
 * @param document the element to clean
 * @return {@code true} when nothing matched and nothing was removed, {@code false} when
 *     elements were removed
 */
private boolean removeEmpty(Element document) {
  final Elements candidates = document.select(":empty").not("body");
  final boolean nothingToRemove = candidates.isEmpty();
  if (!nothingToRemove) {
    candidates.remove();
  }
  return nothingToRemove;
}

Example 8

Source File: CourseParse.java From CourseScheduleDemo with MIT License

5 votes

/**
 * Parses the timetable found in the element with id {@code Table1} into a list of courses.
 * <p>
 * The first two {@code tr} rows are skipped. In every remaining row each {@code td[align]}
 * cell whose text is not exactly one character long becomes a {@code Course}: the day is
 * the cell position (1-based), the class number is the row position (1-based) and the class
 * count is the cell's {@code rowspan} (1 when the attribute is absent, which is the HTML
 * default). Each course gets a random entry of {@code COLOR}.
 * <p>
 * When the table is missing or has fewer than two rows an empty list is returned.
 *
 * @param data the HTML of the timetable page
 * @return the parsed courses, possibly empty
 */
public static List<Course> parsePersonal(String data){
    List<Course> courses = new ArrayList<>();
    Document doc = Jsoup.parse(data);
    // locate the timetable
    Element table = doc.getElementById("Table1");
    if (table == null) {
        // no timetable on this page
        return courses;
    }
    // collect the rows of the table
    Elements trs = table.select("tr");
    if (trs.size() < 2) {
        // fewer rows than the two that are always skipped
        return courses;
    }
    // drop the first two rows, which are not needed
    trs.remove(0);
    trs.remove(0);
    // one generator for the whole parse instead of one per cell
    Random random = new Random();
    // walk the remaining rows
    for (int i=0; i<trs.size(); ++i){
        Element tr = trs.get(i);
        // only cells carrying an align attribute are considered
        Elements tds = tr.select("td[align]");
        // walk the cells of the row
        for(int j=0; j<tds.size(); ++j){
            Element td = tds.get(j);
            String str = td.text();
            // a one-character text is treated as an empty slot and skipped
            if (str.length() != 1){
                // parse the cell text
                str = parsePersonalCourse(str);
                Course course = new Course();
                course.setClsName(str);
                course.setDay(j+1);
                // a cell without rowspan spans exactly one row
                String rowspan = td.attr("rowspan");
                course.setClsCount(rowspan.isEmpty() ? 1 : Integer.parseInt(rowspan));
                course.setClsNum(i+1);
                int num = random.nextInt(COLOR.length);
                course.setColor(COLOR[num]);
                courses.add(course);
            }
        }
    }
    return courses;
}

Example 9

Source File: Comic.java From HHComicViewer with Apache License 2.0

4 votes

/**
 * Re-parses the comic page, refreshes this comic's metadata and chapter lists, and flags an
 * update when the number of parsed chapters differs from the stored chapter count.
 *
 * @param content the HTML of the comic page
 * @return the current value of {@code isUpdate}
 */
public boolean checkUpdate(String content) {
        // The page content is used both to detect updates and to fill in the comic's details
        Document page = Jsoup.parse(content);
        Element infoBlock = page.select("div[id=permalink]").first();

        Element heading = infoBlock.getElementsByTag("h1").first();
        this.title = heading.text();

        Elements infoItems = infoBlock.select("div[id=about_kit]").first().select("li");
        // the first li is skipped
        infoItems.remove(0);
        for (Element item : infoItems) {
            // the bold label of the li decides which field it feeds
            String label = item.getElementsByTag("b").first().text();
            switch (label) {
                case "作者:":
                    this.author = item.text().split(":")[1];
                    break;
                case "状态:":
                    this.comicStatus = item.text();
                    break;
                case "集数:":
                    // appended to the status text, cut after the first closing parenthesis
                    this.comicStatus += (" " + item.text().split("\\)")[0] + ")");
                    break;
                case "更新:":
                    this.comicUpdateTime = item.text();
                    break;
                case "收藏:":
                    this.comicFavorite = item.text();
                    break;
                case "评价:":
                    this.ratingNumber = Float.valueOf(item.getElementsByTag("span").first().text());
                    this.ratingPeopleNum = Integer.valueOf(item.text().split("\\(")[1].split("人")[0]);
                    break;
                case "简介":
                    this.description = item.text();
                    break;
            }
        }

        // Chapter list parsing
        Element volumeList = page.select("div[class=cVolList]").first();
        Elements volumeTags = volumeList.select("div[class=cVolTag]");
        Elements volumeChapterLists = volumeList.select("ul[class=cVolUl]");

        this.chapterName = new ArrayList<>();
        this.chapterId = new ArrayList<>();
        int volumeCount = volumeTags.size();
        for (int i = 0; i < volumeCount; i++) {
            Elements chapterLinks = volumeChapterLists.get(i).select("a[class=l_s]");
            // walk backwards so the chapters, listed in reverse on the page, end up in order
            for (int j = chapterLinks.size() - 1; j >= 0; j--) {
                Element link = chapterLinks.get(j);
                chapterName.add(link.attr("title"));
                // the href carries both the chapter number and the image-server number
                String href = link.attr("href");
                // image-server number
                String serverNumber = href.split("=")[1];
                // chapter number
                String chapterNumber = href.split("/")[1].substring(4);
                chapterId.add(Long.parseLong(chapterNumber));
                if (i == 0) {
                    serverId = Integer.parseInt(serverNumber);
                }
            }
        }
        int parsedCount = this.chapterName.size();
        if (this.chapterCount != parsedCount) {
            this.isUpdate = true;
        }
        this.chapterCount = parsedCount;
        return isUpdate;
    }

Example 10

Source File: EchoMTGDashBoard.java From MtgDesktopCompanion with GNU General Public License v3.0

4 votes

/**
 * Downloads the set page of the given edition and converts every row of
 * {@code table#set-table}, apart from the first and the last one, into a {@code CardShake}.
 *
 * @param ed the edition whose price variations are requested
 * @return the shakes collected for the edition
 * @throws IOException declared by this method; NOTE(review): presumably raised by the HTTP request — confirm
 */
@Override
protected EditionsShakers getOnlineShakesForEdition(MagicEdition ed) throws IOException {

	EditionsShakers result = new EditionsShakers();
	result.setDate(new Date());
	result.setEdition(ed);
	result.setProviderName(getName());

	Document page = RequestBuilder.build().method(METHOD.GET).setClient(client)
	 .url(EchoMTGExport.BASE_URL+"/set/"+ed.getId().toUpperCase()+"/"+ed.getSet().replace(" ", "-").toLowerCase()+"/")
	 .addHeader(URLTools.HOST, WEBSITE)
	 .addHeader(URLTools.REFERER, EchoMTGExport.BASE_URL)
	 .toHtml();

	Elements rows = page.select("table#set-table tr");
	// the first and the last row are not card rows
	rows.remove(rows.first());
	rows.remove(rows.last());

	for (Element row : rows) {
		Elements cells = row.getElementsByTag("td");

		CardShake shake = new CardShake();
		shake.setEd(ed.getId());
		shake.setName(cells.get(2).getElementsByTag("a").first().text());

		double price = Double.parseDouble(cells.get(4).getElementsByTag("a").first().attr("data-price"));
		double lastWeekPrice = price;

		// column 3 holds the percentage change; when present, derive last week's price from it
		String changeText = cells.get(3).text();
		if (!changeText.isEmpty()) {
			double ratio = Double.parseDouble(changeText.replace("%", "")) / 100;
			lastWeekPrice = price - (price * ratio);
		}
		shake.init(price, price, lastWeekPrice);

		shake.setCurrency(getCurrency());
		result.addShake(shake);
	}
	return result;
}

Example 11

Source File: Comic.java From HHComicViewer with Apache License 2.0

4 votes

/**
 * Creates a comic from its id and the HTML of its page, filling in title, thumbnail,
 * metadata and the chapter lists from the page content.
 *
 * @param cid     the comic id
 * @param content the HTML of the comic page
 */
public Comic(int cid, String content) {
        this.cid = cid;
        // The comic's details are filled in from the page content
        Document page = Jsoup.parse(content);
        Element infoBlock = page.select("div[class=product]").first();

        Element heading = infoBlock.getElementsByTag("h1").first();
        this.title = heading.text();
        Element styleBlock = infoBlock.select("div[id=about_style]").first();
        this.thumbnailUrl = styleBlock.getElementsByTag("img").first().attr("src");

        Elements infoItems = infoBlock.select("div[id=about_kit]").first().select("li");
        // the first li is skipped
        infoItems.remove(0);
        for (Element item : infoItems) {
            // the bold label of the li decides which field it feeds
            String label = item.getElementsByTag("b").first().text();
            switch (label) {
                case "作者:":
                    this.author = item.text().split(":")[1];
                    break;
                case "状态:":
                    this.comicStatus = item.text();
                    break;
                case "集数:":
                    // appended to the status text, cut after the first closing parenthesis
                    this.comicStatus += (" " + item.text().split("\\)")[0] + ")");
                    break;
                case "更新:":
                    this.comicUpdateTime = item.text();
                    break;
                case "收藏:":
                    this.comicFavorite = item.text();
                    break;
                case "评价:":
                    this.ratingNumber = Float.valueOf(item.getElementsByTag("span").first().text());
                    this.ratingPeopleNum = Integer.valueOf(item.text().split("\\(")[1].split("人")[0]);
                    break;
                case "简介":
                    this.description = item.text();
                    break;
            }
        }

        // Chapter list parsing
        Element volumeList = page.select("div[class=cVolList]").first();
        Elements volumeTags = volumeList.select("div[class=cVolTag]");
        Elements volumeChapterLists = volumeList.select("ul[class=cVolUl]");

        this.chapterName = new ArrayList<>();
        this.chapterId = new ArrayList<>();
        int volumeCount = volumeTags.size();
        for (int i = 0; i < volumeCount; i++) {
            Elements chapterLinks = volumeChapterLists.get(i).select("a[class=l_s]");
            // walk backwards so the chapters, listed in reverse on the page, end up in order
            for (int j = chapterLinks.size() - 1; j >= 0; j--) {
                Element link = chapterLinks.get(j);
                chapterName.add(link.attr("title"));
                // the href carries both the chapter number and the image-server number
                String href = link.attr("href");
                // image-server number
                String serverNumber = href.split("=")[1];
                // chapter number
                String chapterNumber = href.split("/")[1].substring(4);
                chapterId.add(Long.parseLong(chapterNumber));
                if (i == 0) {
                    serverId = Integer.parseInt(serverNumber);
                }
            }
        }
        this.chapterCount = this.chapterName.size();
    }

Example 12

Source File: MagicBazarShopper.java From MtgDesktopCompanion with GNU General Public License v3.0

4 votes

/**
 * Converts the rows of an order page ({@code div.table div.tr}) into order entries.
 * <p>
 * The first row is skipped and rows without a name are ignored. Rows carrying the
 * {@code filterElement} class are treated as cards; every other row is classified as full
 * set, booster, box or lot from keywords in its description. Each entry is passed to
 * {@code notify} before it is added to the result.
 * <p>
 * An empty selection yields an empty list instead of an {@code IndexOutOfBoundsException}.
 *
 * @param doc  the order page
 * @param id   the transaction id stored on every entry
 * @param date the transaction date stored on every entry
 * @return the parsed entries, possibly empty
 */
private List<OrderEntry> parse(Document doc, String id, Date date) {
	List<OrderEntry> entries = new ArrayList<>();
	Elements table = doc.select("div.table div.tr");
	if (table.isEmpty()) {
		// no rows at all: nothing to skip and nothing to parse
		return entries;
	}
	table.remove(0);

	// looked up once instead of twice per row
	Currency eur = Currency.getInstance("EUR");

	for (int i = 0; i < table.size(); i++)
	{
		Element e = table.get(i);
		boolean iscard = e.hasClass("filterElement");
		String name = e.select("div.td.name").text();

		if (!name.isEmpty())
		{
			OrderEntry entrie = new OrderEntry();
				entrie.setIdTransation(id);
				entrie.setSource(getName());
				entrie.setCurrency(eur);
				entrie.setSeller(getName());
				entrie.setTypeTransaction(TYPE_TRANSACTION.BUY);
				entrie.setTransactionDate(date);
				entrie.setDescription(name);
				if (iscard)
				{
					entrie.setType(TYPE_ITEM.CARD);
					entrie.setDescription(e.select("div.td.name.name_mobile").text());
					entrie.setItemPrice(UITools.parseDouble(e.attr("attribute_price")));
					String set = e.select("div.td.ext img").attr("title");
					try {
						entrie.setEdition(MTGControler.getInstance().getEnabled(MTGCardsProvider.class).getSetByName(set));
					}
					catch (Exception ex)
					{
						// best effort: the entry is kept without an edition, but the cause is logged
						logger.error(set + " is not found", ex);
					}
				}
				else
				{
					// literal replace: the currency symbol must not be interpreted as a regex
					String price = e.select("div.new_price").html().replace("&nbsp;" + eur.getSymbol(), "").trim();
					entrie.setItemPrice(UITools.parseDouble(price));
					String description = entrie.getDescription();
					String lowerDescription = description.toLowerCase();
					if (description.contains("Set") || lowerDescription.contains("collection"))
						entrie.setType(TYPE_ITEM.FULLSET);
					else if (lowerDescription.contains("booster"))
						entrie.setType(TYPE_ITEM.BOOSTER);
					else if (lowerDescription.startsWith("boite de") || description.contains("Display"))
						entrie.setType(TYPE_ITEM.BOX);
					else
						entrie.setType(TYPE_ITEM.LOTS);
				}
				notify(entrie);
				entries.add(entrie);
		}
	}

	return entries;
}

Example 13

Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0

4 votes

/**
 * Builds {@code Book} objects from the given list-item elements.
 * <p>
 * For every element the name and id are read from {@code p[class=name]}, the category chain
 * from the links inside {@code p[class=info]} (linked below a fresh {@code RootBookClass}),
 * and author, publish date, theme and detailed category from the element's text. A book is
 * created only when both name and id were found; otherwise the element's text is printed as
 * an error.
 *
 * @param booksliNode the elements describing the books
 * @return the books that could be built
 */
private Set<Book> queryBooks(Elements booksliNode) {
    Set<Book> books = new HashSet<>();
    // compiled once for all elements instead of once per element
    Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[:：](.*) 出版日期[:：](\\d+).*?(?:主题词[:：](.+))? 分类[:：](.*)");
    Pattern minPattern = Pattern.compile(".*(《.*》).*");
    for (Element element : booksliNode) {
        // read the book name and id
        String name = null, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
        BookClass bookBookClass;
        Elements nameIdNode = element.select("p[class=name]");
        if (nameIdNode != null) {
            name = nameIdNode.text();
            Elements idNode = nameIdNode.select("a[onclick]");
            if (idNode != null && idNode.size() > 0) {
                // the id is the text between "(" and the last "," of the onclick attribute
                String idOnClick = idNode.get(0).attr("onclick");
                int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
                if (start != 0 && end != -1) {
                    id = idOnClick.substring(start, end);
                }
            }
        }
        // read the category chain
        BookClass[] bookClasses = new BookClass[0];
        Elements infoNode = element.select("p[class=info]");
        if (infoNode != null) {
            Elements bookInfos = infoNode.select("a");
            if (bookInfos != null && bookInfos.size() > 0) {
                // the last link is converted with the terminal flag set, all others without it
                Element terminalCataNode = bookInfos.last();
                bookInfos.remove(terminalCataNode);
                List<BookClass> tmplist = bookInfos.stream()
                        .map(bookInfo -> getBookCata(bookInfo, false))
                        .filter(Objects::nonNull)
                        .collect(Collectors.toList());
                BookClass terminalBookClass = getBookCata(terminalCataNode, true);
                if (terminalBookClass != null) {
                    tmplist.add(terminalBookClass);
                }
                bookClasses = tmplist.toArray(bookClasses);
            }
        }
        bookBookClass = new RootBookClass().link(bookClasses);

        // read author, publish date, theme and detailed category from the text
        String info = element.text();
        Matcher matcher = pattern.matcher(info);
        while (matcher.find()) {
            name = matcher.group(1);
            author = matcher.group(2);
            publishDate = matcher.group(3);
            theme = matcher.group(4);
            detailBookClass = matcher.group(5);
        }
        // a title enclosed in 《》 takes precedence as the name
        Matcher minMatcher = minPattern.matcher(info);
        while (minMatcher.find()) {
            name = minMatcher.group(1);
        }

        // assemble the book
        if (name != null && id != null) {
            Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
            book.setCookie(cookie);
            books.add(book);
            if (bookBookClass.isTerminal()) {
                ((TerminalBookClass) bookBookClass).addBook(book);
            } else {
                System.out.println("未获取到分类信息，将不被归档 " + book);
            }
        } else {
            System.out.println("error: " + info);
        }
    }
    return books;
}

Example 14

Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0

4 votes

/**
 * Builds {@code Book} objects from the given list-item elements.
 * <p>
 * For every element the name and id are read from {@code p[class=name]}, the category chain
 * from the links inside {@code p[class=info]} (linked below this book class), and author,
 * publish date, theme and detailed category from the element's text. A book is created only
 * when both name and id were found; otherwise the element's text is printed as an error.
 *
 * @param booksliNode the elements describing the books
 * @return the books that could be built
 */
private Set<Book> queryBooks(Elements booksliNode) {
    Set<Book> books = new HashSet<>();
    // compiled once for all elements instead of once per element
    Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[:：](.*) 出版日期[:：](\\d+).*?(?:主题词[:：](.+))? 分类[:：](.*)");
    Pattern minPattern = Pattern.compile(".*(《.*》).*");
    for (Element element : booksliNode) {
        // read the book name and id
        String name = null, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
        BookClass bookBookClass;
        Elements nameIdNode = element.select("p[class=name]");
        if (nameIdNode != null) {
            name = nameIdNode.text();
            Elements idNode = nameIdNode.select("a[onclick]");
            if (idNode != null && idNode.size() > 0) {
                // the id is the text between "(" and the last "," of the onclick attribute
                String idOnClick = idNode.get(0).attr("onclick");
                int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
                if (start != 0 && end != -1) {
                    id = idOnClick.substring(start, end);
                }
            }
        }
        // read the category chain
        BookClass[] bookClasses = new BookClass[0];
        Elements infoNode = element.select("p[class=info]");
        if (infoNode != null) {
            Elements bookInfos = infoNode.select("a");
            if (bookInfos != null && bookInfos.size() > 0) {
                // the last link is converted with the terminal flag set, all others without it
                Element terminalCataNode = bookInfos.last();
                bookInfos.remove(terminalCataNode);
                List<BookClass> tmplist = bookInfos.stream()
                        .map(bookInfo -> getBookCata(bookInfo, false))
                        .filter(Objects::nonNull)
                        .collect(Collectors.toList());
                BookClass terminalBookClass = getBookCata(terminalCataNode, true);
                if (terminalBookClass != null) {
                    tmplist.add(terminalBookClass);
                }
                bookClasses = tmplist.toArray(bookClasses);
            }
        }
        bookBookClass = this.link(bookClasses);

        // read author, publish date, theme and detailed category from the text
        String info = element.text();
        Matcher matcher = pattern.matcher(info);
        while (matcher.find()) {
            name = matcher.group(1);
            author = matcher.group(2);
            publishDate = matcher.group(3);
            theme = matcher.group(4);
            detailBookClass = matcher.group(5);
        }
        // a title enclosed in 《》 takes precedence as the name
        Matcher minMatcher = minPattern.matcher(info);
        while (minMatcher.find()) {
            name = minMatcher.group(1);
        }

        // assemble the book
        if (name != null && id != null) {
            Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
            book.setCookie(cookie);
            books.add(book);
            if (bookBookClass.isTerminal()) {
                ((TerminalBookClass) bookBookClass).addBook(book);
            } else {
                System.out.println("未获取到分类信息，将不被归档 " + book);
            }
        } else {
            System.out.println("error: " + info);
        }
    }
    return books;
}

Example 15

Source File: JsoupHelper.java From seed with Apache License 2.0

4 votes

/**
 * Crawls the content of a Tianya forum thread.
 * <p>
 * Only the posts of the thread starter are captured for now. The content is appended to a
 * txt file named after the thread URL, located in the directory returned by
 * {@code FileSystemView.getHomeDirectory()}; the file is emptied first.
 * <p>
 * Pages from the one encoded in {@code bbsURL} up to and including {@code finalPageNo} are
 * processed.
 *
 * @param bbsURL      thread address (the first page or any other page of the thread)
 * @param finalPageNo the highest page number of the thread; when it exceeds the real last
 *                    page, crawling stops by itself after the real last page
 * @throws IOException declared by this method; NOTE(review): presumably from fetching pages or writing the file — confirm
 */
private static void getTianyaBBSTxt(String bbsURL, int finalPageNo) throws IOException {
    String txt;
    String author;
    String publishTime;
    Element atlInfo;
    Elements elements;
    Document document;
    // strip any parameters that follow ".shtml" in the URL
    bbsURL = bbsURL.endsWith("shtml") ? bbsURL : bbsURL.substring(0, bbsURL.indexOf(".shtml")+6);
    // work out the target txt file and empty it up front (in case it already exists)
    String filePath = FileSystemView.getFileSystemView().getHomeDirectory().getAbsolutePath();
    String fileName = bbsURL.substring(bbsURL.indexOf("post")).replace(".shtml", ".txt");
    File bbsFile = new File(filePath, fileName);
    FileUtils.writeStringToFile(bbsFile, "", StandardCharsets.UTF_8);
    // the starting page number is the part between the last "-" and the last "."
    int pageNo = Integer.parseInt(bbsURL.substring(bbsURL.lastIndexOf("-")+1, bbsURL.lastIndexOf(".")));
    // process every floor of every page; the bound is inclusive because finalPageNo is
    // itself the last page to crawl
    for(int i=pageNo; i<=finalPageNo; i++){
        if(i == 1){
            /*
             * The opening post is handled separately (it only exists on the first page)
             */
            document = Jsoup.connect(bbsURL).get();
            // read author and publish time
            atlInfo = document.getElementById("post_head").select("div.atl-info").first();
            author = atlInfo.select("span").eq(0).select("a").first().text();
            publishTime = atlInfo.select("span").eq(1).text();
            // every <div class="atl-item"></div> is one floor, the opening post included
            elements = document.getElementsByClass("atl-item");
            // the actual floor content is wrapped in <div class="bbs-content"></div>
            txt = elements.first().select("div.bbs-content").html().replaceAll("<br>", "");
            // write to the txt file
            FileUtils.writeStringToFile(bbsFile, "楼主："+author+"，"+publishTime+"\r\n", StandardCharsets.UTF_8, true);
            FileUtils.writeStringToFile(bbsFile, txt+"\r\n\r\n", StandardCharsets.UTF_8, true);
            // the opening post is done, so take it out of the list iterated below
            elements.remove(0);
        }else{
            /*
             * For every other page the URL is recomputed and the page fetched again
             */
            bbsURL = bbsURL.replace("-"+(i-1)+".shtml", "-"+i+".shtml");
            document = Jsoup.connect(bbsURL).get();
            // a request beyond the last page gets redirected to the last page
            if(!bbsURL.equals(document.location())){
                System.out.println("帖子抓取完毕");
                return;
            }
            // the floors of this page
            elements = document.getElementsByClass("atl-item");
        }
        /*
         * Both branches end up with the elements of the current page that still need processing
         */
        for(Element obj: elements){
            atlInfo = obj.select("div.atl-info").first();
            String authorType = atlInfo.select("strong.host").text();
            // a blank author type means the floor is not by the thread starter; skip it for now
            if(StringUtils.isNotBlank(authorType)){
                txt = obj.select("div.bbs-content").html().replaceAll("<br>", "");
                author = atlInfo.select("span").eq(0).select("a").first().text();
                publishTime = atlInfo.select("span").eq(1).text();
                FileUtils.writeStringToFile(bbsFile, authorType+"："+author+"，"+publishTime+"\r\n", StandardCharsets.UTF_8, true);
                FileUtils.writeStringToFile(bbsFile, txt+"\r\n\r\n", StandardCharsets.UTF_8, true);
            }
        }
    }
}

Example 16

Source File: Expression.java From firing-range with Apache License 2.0

4 votes

/**
 * Parses the {@code q} parameter as an HTML body fragment and echoes back, via
 * {@code Responses.sendXssed}, every element that passes the attribute filter.
 * <p>
 * An element is dropped when any of its attributes has a key starting with {@code on}, a key
 * equal to {@code href} or {@code src}, or is a {@code style} attribute whose value contains
 * {@code expression} (all compared in lower case). A 400 error is sent when {@code q} is
 * missing or produces no elements.
 *
 * @param request  the incoming request
 * @param response the response to write to
 * @throws IOException declared by this method; NOTE(review): presumably from writing the response — confirm
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
  if (request.getParameter("q") == null) {
    Responses.sendError(response, "Missing q parameter", 400);
    return;
  }

  String q = request.getParameter("q");
  Element body = Jsoup.parseBodyFragment(q).body();
  Elements candidates = body.getAllElements();
  // the body wrapper itself is never echoed
  candidates.remove(body);
  if (candidates.isEmpty()) {
    Responses.sendError(response, "Invalid input, no tags", 400);
    return;
  }

  StringBuilder output = new StringBuilder();
  for (Element candidate : candidates) {
    boolean keep = true;

    for (Attribute attr : candidate.attributes()) {
      String key = attr.getKey().toLowerCase();
      boolean blockedKey = key.startsWith("on") || key.equals("href") || key.equals("src");
      boolean expressionStyle =
          key.equals("style") && attr.getValue().toLowerCase().contains("expression");
      if (blockedKey || expressionStyle) {
        keep = false;
      }
    }

    if (keep) {
      output.append(candidate.toString());
    }
  }
  Responses.sendXssed(response, output.toString());
}

Example 17

Source File: TtsHelper.java From coolreader with MIT License

4 votes

/**
 * Removes from {@code elements} every element returned by {@code el.getAllElements()}.
 * NOTE(review): getAllElements() appears to include {@code el} itself, so despite the
 * method name the element itself is removed as well — confirm.
 *
 * @param el       the element whose subtree is taken out of the list
 * @param elements the list to remove from
 */
private void removeAllChildren(Element el, Elements elements) {
	Elements subtree = el.getAllElements();
	for (int idx = 0; idx < subtree.size(); idx++) {
		elements.remove(subtree.get(idx));
	}
}

Example 18

Source File: MTGGradeGrader.java From MtgDesktopCompanion with GNU General Public License v3.0

3 votes

/**
 * Loads the product page for the given identifier and builds a {@code Grading} from it.
 * <p>
 * The grade note is parsed from the fourth {@code td} found in the rows of
 * {@code table.table-product} once the first row has been dropped.
 *
 * @param identifier the product identifier appended to the {@code /produit/} URL
 * @return the grading, or {@code null} when the page contains no matching table rows
 * @throws IOException declared by this method; NOTE(review): presumably raised by the HTTP request — confirm
 */
@Override
public Grading loadGrading(String identifier) throws IOException {

	String url = getWebSite() + "/produit/" + identifier;

	Document page = RequestBuilder.build().method(METHOD.GET)
			   .setClient(URLTools.newClient())
			   .url(url)
			   .toHtml();

	Elements rows = page.select("table.table-product tr");
	if (!rows.isEmpty()) {
		Grading grading = new Grading();
		grading.setGraderName(getName());
		grading.setNumberID(identifier);
		grading.setUrlInfo(url);

		// the first row is skipped
		rows.remove(0);

		logger.debug("found " + rows.text());

		String noteText = rows.select("td").get(3).text();
		grading.setGradeNote(Double.parseDouble(noteText));
		return grading;
	}
	return null;
}