Java Code Examples for org.jsoup.select.Elements#remove()

The following examples show how to use org.jsoup.select.Elements#remove(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: CSSReverter.java    From BlogManagePlatform with Apache License 2.0 6 votes vote down vote up
/**
 * Turns the external CSS referenced by the HTML into inline {@code <style>} blocks and
 * removes the stylesheet links that were inlined.
 * <p>
 * Only {@code link[href]} elements whose href ends with ".css" are processed; every other
 * link (icons, canonical URLs, ...) is left in the document. The generated style blocks are
 * prepended to the {@code html} element in the same relative order as the links they replace,
 * so the CSS cascade is not reversed.
 * <p>
 * If anything goes wrong (for example a stylesheet cannot be read), the error is logged and
 * the input is returned unchanged.
 *
 * @param html the HTML text to process; must not be null
 * @return the HTML with stylesheets inlined, or the original text on failure
 * @author Frodez
 * @date 2019-03-21
 */
@Override
public String revert(String html) {
	Assert.notNull(html, "html must not be null");
	try {
		Document document = Jsoup.parse(html);
		Elements links = document.select("link[href]");
		Elements htmlElement = document.select("html");
		// prepend() always inserts at position 0, so walk the links backwards:
		// the first stylesheet then ends up first in the output.
		for (int i = links.size() - 1; i >= 0; i--) {
			Element link = links.get(i);
			String path = link.attr("href");
			if (!path.endsWith(".css")) {
				// not a stylesheet reference: keep it in the document
				continue;
			}
			htmlElement.prepend(StrUtil.concat("<style type=\"text/css\">", FileUtil.readString(ResourceUtils
				.getFile(StrUtil.concat(FreemarkerRender.getLoaderPath(), path))), "</style>"));
			// drop only the link that has just been inlined
			link.remove();
		}
		return document.html();
	} catch (Exception e) {
		log.error("[frodez.util.renderer.reverter.CSSReverter.revert]", e);
		return html;
	}
}
 
Example 2
Source File: WeiboHotProcessor.java    From hot-crawler with MIT License 6 votes vote down vote up
/**
 * Converts the given elements into {@code Info} entries.
 * <p>
 * The first two elements are removed from {@code elements} before conversion. Each remaining
 * element contributes the first anchor found inside its first "td-02" child: the anchor's
 * inner HTML becomes the title and its href, prefixed with {@code this.prefix}, becomes the URL.
 * Ids are assigned sequentially starting at 1.
 *
 * @param elements the elements to convert; may be null
 * @return the converted entries, empty when {@code elements} is null
 */
@Override
protected List<Info> getInfoDataByElements(Elements elements) {
    final List<Info> infos = new ArrayList<>();
    if (elements == null) {
        return infos;
    }
    // discard the two leading tr elements
    elements.remove(0);
    elements.remove(0);
    int rank = 0;
    for (Element row : elements) {
        final Element anchor = row.getElementsByClass("td-02").get(0).getElementsByTag("a").get(0);
        rank++;
        final String href = anchor.attr("href");
        final String title = anchor.html();
        infos.add(new Info(String.valueOf(rank), title, this.prefix + href));
    }
    return infos;
}
 
Example 3
Source File: MTGoldFishDashBoard.java    From MtgDesktopCompanion with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Loads the format-staples table for a format and converts each row to a {@code CardDominance}.
 * <p>
 * The first two {@code table tr} rows are dropped before parsing. A row that cannot be parsed
 * is logged and skipped, so it never aborts the whole call.
 *
 * @param f      the format whose staples are requested (COMMANDER uses the commander_1v1 page)
 * @param filter the last path segment of the URL (original note: spells, creatures, all, lands)
 * @return the rows that were parsed successfully
 * @throws IOException declared by the signature; NOTE(review): presumably raised by the page fetch
 */
@Override
public List<CardDominance> getBestCards(MagicFormat.FORMATS f, String filter) throws IOException {

	String u = getString(WEBSITE) + "/format-staples/" + f.name().toLowerCase() + "/full/" + filter;

	if (f == MagicFormat.FORMATS.COMMANDER) {
		u = getString(WEBSITE) + "/format-staples/commander_1v1/full/" + filter;
	}

	Document doc = URLTools.extractHtml(u);

	logger.debug("get best cards : " + u);
	Elements rows = doc.select("table tr");
	rows.remove(0);
	rows.remove(0);

	List<CardDominance> dominances = new ArrayList<>();
	for (int idx = 0; idx < rows.size(); idx++) {
		Elements cells = rows.get(idx).select(MTGConstants.HTML_TAG_TD);
		try {
			// for the "lands" filter the percentage columns sit one cell to the left
			int shift = filter.equalsIgnoreCase("lands") ? 1 : 0;

			CardDominance dominance = new CardDominance();
			dominance.setPosition(Integer.parseInt(cells.get(0).text()));
			dominance.setCardName(cells.get(1).text());

			String decksPercent = cells.get(3 - shift).text().replaceAll("\\%", "");
			dominance.setDecksPercent(Double.parseDouble(decksPercent));

			String players = cells.get(4 - shift).text().replaceAll("\\%", "");
			dominance.setPlayers(Double.parseDouble(players));

			dominances.add(dominance);
		} catch (Exception ex) {
			logger.error("Error parsing " + cells, ex);
		}
	}
	return dominances;
}
 
Example 4
Source File: TagServlet.java    From firing-range with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the {@code q} parameter as an HTML body fragment and hands the resulting elements,
 * together with the tag and attribute named in the path info, to {@code handleRequest}.
 * <p>
 * A 400 error is sent when {@code q} is missing or yields no elements. The synthetic
 * {@code body} element is dropped from the element list unless {@code q} itself contains
 * the text "body".
 * <p>
 * The path info is read as {@code /<allowedTag>[/<allowedAttribute>[/...]]}; missing parts
 * default to the empty string.
 *
 * @param request  the incoming request
 * @param response the response to write to
 * @throws IOException declared by the signature
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
  String q = request.getParameter("q");
  if (q == null) {
    Responses.sendError(response, "Missing q parameter", 400);
    return;
  }

  Document doc = Jsoup.parseBodyFragment(q);
  Element body = doc.body();
  Elements elements = body.getAllElements();
  if (!(q.contains("body"))){
    elements.remove(body);
  }

  if (elements.isEmpty()) {
    Responses.sendError(response, "Invalid input, no tags", 400);
    return;
  }

  String allowedTag = "";
  String allowedAttribute = "";
  String rawPathInfo = request.getPathInfo();
  if (rawPathInfo != null) {
    String pathInfo = rawPathInfo.substring(1);
    // A negative limit keeps trailing empty segments, so "tag/" splits into
    // ["tag", ""] instead of ["tag"]. The previous split("/")[1] threw
    // ArrayIndexOutOfBoundsException for such paths.
    String[] segments = pathInfo.split("/", -1);
    allowedTag = segments[0];
    if (segments.length > 1) {
      allowedAttribute = segments[1];
    }
  }
  handleRequest(elements, response, allowedTag, allowedAttribute);
}
 
Example 5
Source File: DescendantSelector.java    From JsoupXpath with Apache License 2.0 5 votes vote down vote up
/**
 * Collects the descendants of every element in the context.
 * <p>
 * For each context element, all elements of its subtree except the element itself are
 * gathered into a set (so an element reachable from several context nodes appears once),
 * and the set is then wrapped into an {@code Elements} value.
 *
 * @param context the elements whose descendants are wanted
 * @return an {@code XValue} holding the de-duplicated descendants
 */
@Override
public XValue apply(Elements context) {
    Set<Element> unique = new HashSet<>();
    for (Element node : context) {
        Elements subtree = node.getAllElements();
        // the subtree list is stripped of the node itself: only descendants count
        subtree.remove(node);
        unique.addAll(subtree);
    }
    Elements descendant = new Elements();
    descendant.addAll(unique);
    return XValue.create(descendant);
}
 
Example 6
Source File: StructuralHtml.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the structural hierarchy of the document into {@code body} and then tidies the
 * generated HTML: list paragraphs are wrapped in {@code li}, header cells become {@code th},
 * empty cells receive a non-breaking space and, unless {@code outputEmptyTags} is set,
 * empty elements are removed repeatedly until none are left.
 *
 * @param jCas the CAS the structure hierarchy is built from
 * @param body the body element to populate
 */
@Override
protected void writeBody(final JCas jCas, final Element body) {

  final Node<Structure> hierarchyRoot =
      StructureHierarchy.build(jCas, structuralClasses).getRoot();
  walk(body, hierarchyRoot);

  // Paragraphs placed directly under ul / ol need a proper li wrapper
  final String listItem = "<li></li>";
  body.select("ul > p").wrap(listItem);
  body.select("ol > p").wrap(listItem);

  // Cells inside a table header are th, not td
  body.select("thead td").tagName("th");

  // Give empty td / th cells a non-breaking space as content
  body.select("td:empty,th:empty").html("&nbsp");

  if (!outputEmptyTags) {
    // Removing an empty element can leave its parent empty, so repeat until stable
    for (Elements empty = emptyElements(body); !empty.isEmpty(); empty = emptyElements(body)) {
      empty.remove();
    }
  }

  // TODO: In accordance with HTML spec
  // - Captions for Table should be moved inside the table
  // - Captions for Figure should be moved inside the figure

}
 
Example 7
Source File: RemoveEmptyText.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Removes every element matching {@code :empty} (other than {@code body}) from the document.
 *
 * @param document the element to search
 * @return {@code true} when no such element was found (nothing was removed),
 *     {@code false} when elements were removed
 */
private boolean removeEmpty(Element document) {
  final Elements candidates = document.select(":empty").not("body");
  final boolean nothingToRemove = candidates.isEmpty();
  if (!nothingToRemove) {
    candidates.remove();
  }
  return nothingToRemove;
}
 
Example 8
Source File: CourseParse.java    From CourseScheduleDemo with MIT License 5 votes vote down vote up
/**
 * Parses a personal timetable page into a list of courses.
 * <p>
 * The rows of the element with id {@code Table1} are read, the first two rows are skipped,
 * and every {@code td[align]} cell whose text length is not exactly 1 is turned into a
 * {@code Course}. The cell index (1-based) becomes the day, the row index (1-based, counted
 * after the skipped rows) becomes the class number, the {@code rowspan} attribute becomes the
 * class count, and a colour is picked at random from {@code COLOR}.
 * <p>
 * When the table is missing or has fewer than two rows, an empty list is returned
 * rather than failing with a NullPointerException / IndexOutOfBoundsException.
 *
 * @param data the HTML of the timetable page
 * @return the parsed courses; never null
 * @throws NumberFormatException if a processed cell has no numeric {@code rowspan} attribute
 */
public static List<Course> parsePersonal(String data){
    List<Course> courses = new ArrayList<>();
    Document doc = Jsoup.parse(data);
    // First locate the table
    Element table = doc.getElementById("Table1");
    if (table == null) {
        return courses;
    }
    // Then fetch the rows of the table
    Elements trs = table.select("tr");
    if (trs.size() < 2) {
        return courses;
    }
    // Drop the rows that are not needed: the first two
    trs.remove(0);
    trs.remove(0);
    // One generator for the whole parse instead of a new one per cell
    Random random = new Random();
    for (int i=0; i<trs.size(); ++i){
        Element tr = trs.get(i);
        // Only cells carrying an align attribute are considered
        Elements tds = tr.select("td[align]");
        for(int j=0; j<tds.size(); ++j){
            Element td = tds.get(j);
            String str = td.text();
            // A text of length 1 is treated as an empty cell and skipped
            if (str.length() != 1){
                // Parse the cell text
                str = parsePersonalCourse(str);
                Course course = new Course();
                course.setClsName(str);
                course.setDay(j+1);
                course.setClsCount(Integer.valueOf(td.attr("rowspan")));
                course.setClsNum(i+1);
                int num = random.nextInt(COLOR.length);
                course.setColor(COLOR[num]);
                courses.add(course);
            }
        }
    }
    return courses;
}
 
Example 9
Source File: Comic.java    From HHComicViewer with Apache License 2.0 4 votes vote down vote up
/**
 * Re-parses the comic page and refreshes this comic's fields from it.
 * <p>
 * The title, the info list entries (author, status, update time, favourites, rating,
 * description) and the chapter names / ids are read from {@code content}. When the number of
 * parsed chapters differs from the stored {@code chapterCount}, {@code isUpdate} is set to
 * {@code true}; this method never sets it back to {@code false}.
 *
 * @param content the HTML of the comic page
 * @return the current value of {@code isUpdate}
 */
public boolean checkUpdate(String content) {
        // Check whether there is an update.
        // Fill in the comic's details automatically from the fetched page content
        Document doc = Jsoup.parse(content);
        Element comicInfoDiv = doc.select("div[id=permalink]").first();

        this.title = comicInfoDiv.getElementsByTag("h1").first().text();

        Element about_kit = comicInfoDiv.select("div[id=about_kit]").first();
        Elements comicInfoList = about_kit.select("li");
        // The first li is not processed
        comicInfoList.remove(0);
        for (Element comicInfo : comicInfoList) {
            // Each li is identified by the text of its first <b> label
            switch (comicInfo.getElementsByTag("b").first().text()) {
                case "作者:":
                    this.author = comicInfo.text().split(":")[1];
                    break;
                case "状态:":
                    this.comicStatus = comicInfo.text();
                    break;
                case "集数:":
                    // Appended to the status text set by the previous case
                    this.comicStatus += (" " + comicInfo.text().split("\\)")[0] + ")");
                    break;
                case "更新:":
                    this.comicUpdateTime = comicInfo.text();
                    break;
                case "收藏:":
                    this.comicFavorite = comicInfo.text();
                    break;
                case "评价:":
                    this.ratingNumber = Float.valueOf(comicInfo.getElementsByTag("span").first().text());
                    this.ratingPeopleNum = Integer.valueOf(comicInfo.text().split("\\(")[1].split("人")[0]);
                    break;
                case "简介":
                    this.description = comicInfo.text();
                    break;
            }
        }

        // Parse the chapter list
        Element volListSrc = doc.select("div[class=cVolList]").first();
        Elements tagsSrc = volListSrc.select("div[class=cVolTag]");
        Elements tagchapterSrc = volListSrc.select("ul[class=cVolUl]");

        this.chapterName = new ArrayList<>();
        this.chapterId = new ArrayList<>();
        for (int i = 0; i < tagsSrc.size(); i++) {
//            this.tags.add(tagsSrc.get(i).text());
            Elements chaptersSrc = tagchapterSrc.get(i).select("a[class=l_s]");
//            tagCounts.add(chaptersSrc.size());
            for (int j = chaptersSrc.size() - 1; j > -1; j--) {
                // Iterating backwards turns the page's descending chapter order into ascending order
                chapterName.add(chaptersSrc.get(j).attr("title"));
                // The address has to be transformed because the URL of another site, which is easier to parse, is needed
                String urlSrc = chaptersSrc.get(j).attr("href");
                // Image server number
                String domainNum = urlSrc.split("=")[1];
                // Chapter number
                String chapterNum = urlSrc.split("/")[1].substring(4);
                chapterId.add(Long.parseLong(chapterNum));
                // The server id is taken from the chapters of the first tag group only
                if (i == 0) {
                    serverId = Integer.parseInt(domainNum);
                }
            }
        }
        if (this.chapterCount != this.chapterName.size()) {
            this.isUpdate = true;
        }
        this.chapterCount = this.chapterName.size();
        return isUpdate;
    }
 
Example 10
Source File: EchoMTGDashBoard.java    From MtgDesktopCompanion with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Downloads the set page of an edition and converts each row of {@code table#set-table}
 * (except the first and the last one) into a {@code CardShake}.
 * <p>
 * The price is read from the {@code data-price} attribute in the fifth cell. When the fourth
 * cell holds a percentage, the last-week price is computed as {@code price - price * pc}
 * where {@code pc} is that percentage divided by 100; otherwise it equals the price.
 *
 * @param ed the edition to load
 * @return the shakes of the edition, dated with the current time
 * @throws IOException declared by the signature; NOTE(review): presumably raised by the request
 */
@Override
protected EditionsShakers getOnlineShakesForEdition(MagicEdition ed) throws IOException {

	EditionsShakers shakers = new EditionsShakers();
	shakers.setDate(new Date());
	shakers.setEdition(ed);
	shakers.setProviderName(getName());

	Document page = RequestBuilder.build().method(METHOD.GET).setClient(client)
	 .url(EchoMTGExport.BASE_URL+"/set/"+ed.getId().toUpperCase()+"/"+ed.getSet().replace(" ", "-").toLowerCase()+"/")
	 .addHeader(URLTools.HOST, WEBSITE)
	 .addHeader(URLTools.REFERER, EchoMTGExport.BASE_URL)
	 .toHtml();

	Elements rows = page.select("table#set-table tr");
	// the first and the last row are not card rows
	rows.remove(rows.first());
	rows.remove(rows.last());

	for (Element row : rows) {
		Elements cells = row.getElementsByTag("td");

		CardShake shake = new CardShake();
		shake.setEd(ed.getId());
		shake.setName(cells.get(2).getElementsByTag("a").first().text());

		double price = Double.parseDouble(cells.get(4).getElementsByTag("a").first().attr("data-price"));
		double lastWeekPrice = price;

		String variation = cells.get(3).text();
		if (!variation.isEmpty()) {
			double pc = Double.parseDouble(variation.replace("%",""))/100;
			lastWeekPrice = price - (price*pc);
		}
		shake.init(price, price, lastWeekPrice);
		shake.setCurrency(getCurrency());

		shakers.addShake(shake);
	}
	return shakers;
}
 
Example 11
Source File: Comic.java    From HHComicViewer with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a comic from its id and the HTML of its page.
 * <p>
 * The title, thumbnail URL, the info list entries (author, status, update time, favourites,
 * rating, description) and the chapter names / ids are read from {@code content};
 * {@code chapterCount} is set to the number of parsed chapters.
 *
 * @param cid     the comic id
 * @param content the HTML of the comic page
 */
public Comic(int cid, String content) {
        this.cid = cid;
        // Fill in the comic's details automatically from the fetched page content
        Document doc = Jsoup.parse(content);
        Element comicInfoDiv = doc.select("div[class=product]").first();

        this.title = comicInfoDiv.getElementsByTag("h1").first().text();
        this.thumbnailUrl = comicInfoDiv.select("div[id=about_style]").first()
                .getElementsByTag("img").first().attr("src");

        Element about_kit = comicInfoDiv.select("div[id=about_kit]").first();
        Elements comicInfoList = about_kit.select("li");
        // The first li is not processed
        comicInfoList.remove(0);
        for (Element comicInfo : comicInfoList) {
            // Each li is identified by the text of its first <b> label
            switch (comicInfo.getElementsByTag("b").first().text()) {
                case "作者:":
                    this.author = comicInfo.text().split(":")[1];
                    break;
                case "状态:":
                    this.comicStatus = comicInfo.text();
                    break;
                case "集数:":
                    // Appended to the status text set by the previous case
                    this.comicStatus += (" " + comicInfo.text().split("\\)")[0] + ")");
                    break;
                case "更新:":
                    this.comicUpdateTime = comicInfo.text();
                    break;
                case "收藏:":
                    this.comicFavorite = comicInfo.text();
                    break;
                case "评价:":
                    this.ratingNumber = Float.valueOf(comicInfo.getElementsByTag("span").first().text());
                    this.ratingPeopleNum = Integer.valueOf(comicInfo.text().split("\\(")[1].split("人")[0]);
                    break;
                case "简介":
                    this.description = comicInfo.text();
                    break;
            }
        }

        // Parse the chapter list
        Element volListSrc = doc.select("div[class=cVolList]").first();
        Elements tagsSrc = volListSrc.select("div[class=cVolTag]");
        Elements tagchapterSrc = volListSrc.select("ul[class=cVolUl]");

        this.chapterName = new ArrayList<>();
        this.chapterId = new ArrayList<>();
        for (int i = 0; i < tagsSrc.size(); i++) {
//            this.tags.add(tagsSrc.get(i).text());
            Elements chaptersSrc = tagchapterSrc.get(i).select("a[class=l_s]");
//            tagCounts.add(chaptersSrc.size());
            for (int j = chaptersSrc.size() - 1; j > -1; j--) {
                // Iterating backwards turns the page's descending chapter order into ascending order
                chapterName.add(chaptersSrc.get(j).attr("title"));
                // The address has to be transformed because the URL of another site, which is easier to parse, is needed
                String urlSrc = chaptersSrc.get(j).attr("href");
                // Image server number
                String domainNum = urlSrc.split("=")[1];
                // Chapter number
                String chapterNum = urlSrc.split("/")[1].substring(4);
                chapterId.add(Long.parseLong(chapterNum));
                // The server id is taken from the chapters of the first tag group only
                if (i == 0) {
                    serverId = Integer.parseInt(domainNum);
                }
            }
        }
        this.chapterCount = this.chapterName.size();
    }
 
Example 12
Source File: MagicBazarShopper.java    From MtgDesktopCompanion with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Converts the rows of an order page into {@code OrderEntry} objects.
 * <p>
 * The first {@code div.table div.tr} row is dropped. Rows without a name are skipped.
 * Rows carrying the {@code filterElement} class are treated as cards (price from the
 * {@code attribute_price} attribute, edition looked up from the set image title); all other
 * rows get their price from {@code div.new_price} and a type derived from the description.
 * Every entry is passed to {@code notify} before being added to the result.
 *
 * @param doc  the order page
 * @param id   the transaction id stored on every entry
 * @param date the transaction date stored on every entry
 * @return the entries built from the page
 */
private List<OrderEntry> parse(Document doc, String id, Date date) {
	List<OrderEntry> entries = new ArrayList<>();
	Elements rows = doc.select("div.table div.tr");
	rows.remove(0);

	for (Element row : rows) {
		boolean isCard = row.hasClass("filterElement");
		String name = row.select("div.td.name").text();

		if (name.isEmpty()) {
			// nothing to record for a row without a name
			continue;
		}

		OrderEntry entry = new OrderEntry();
		entry.setIdTransation(id);
		entry.setSource(getName());
		entry.setCurrency(Currency.getInstance("EUR"));
		entry.setSeller(getName());
		entry.setTypeTransaction(TYPE_TRANSACTION.BUY);
		entry.setTransactionDate(date);
		entry.setDescription(name);

		if (isCard) {
			entry.setType(TYPE_ITEM.CARD);
			entry.setDescription(row.select("div.td.name.name_mobile").text());
			entry.setItemPrice(UITools.parseDouble(row.attr("attribute_price")));
			String set = row.select("div.td.ext img").attr("title");
			try {
				entry.setEdition(MTGControler.getInstance().getEnabled(MTGCardsProvider.class).getSetByName(set));
			} catch (Exception ex) {
				logger.error(set + " is not found");
			}
		} else {
			String price = row.select("div.new_price").html()
					.replaceAll("&nbsp;"+Currency.getInstance("EUR").getSymbol(), "")
					.trim();
			entry.setItemPrice(UITools.parseDouble(price));

			// classify the product from keywords in its description
			String description = entry.getDescription();
			String lowered = description.toLowerCase();
			if (description.contains("Set") || lowered.contains("collection")) {
				entry.setType(TYPE_ITEM.FULLSET);
			} else if (lowered.contains("booster")) {
				entry.setType(TYPE_ITEM.BOOSTER);
			} else if (lowered.startsWith("boite de") || description.contains("Display")) {
				entry.setType(TYPE_ITEM.BOX);
			} else {
				entry.setType(TYPE_ITEM.LOTS);
			}
		}
		notify(entry);
		entries.add(entry);
	}

	return entries;
}
 
Example 13
Source File: BookClass.java    From nju-lib-downloader with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Builds {@code Book} objects from the given book list nodes.
 * <p>
 * For every node the name and id are read from {@code p[class=name]}, the category chain from
 * the anchors in {@code p[class=info]} (linked under a new {@code RootBookClass}), and author,
 * publish date, subject terms and category text from the node's text via regular expressions.
 * A node yields a book only when both a name and an id were found; otherwise an error line is
 * printed. Books whose category is terminal are also added to that category.
 *
 * @param booksliNode the nodes to convert, one per book
 * @return the books that could be built
 */
private Set<Book> queryBooks(Elements booksliNode) {
    Set<Book> books = new HashSet<>();
    for (Element element : booksliNode) {
        // Get the book name and id
        String name = null, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
        BookClass bookBookClass;
        Elements nameIdNode = element.select("p[class=name]");
        if (nameIdNode != null) {
            name = nameIdNode.text();
            Elements idNode = nameIdNode.select("a[onclick]");
            if (idNode != null && idNode.size() > 0) {
                // The id is the text between the first "(" and the last "," of the onclick value
                String idOnClick = idNode.get(0).attr("onclick");
                int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
                if (start != 0 && end != -1) {
                    id = idOnClick.substring(start, end);
                }
            }
        }
        // Get the categories
        BookClass[] bookClasses = new BookClass[0];
        Elements infoNode = element.select("p[class=info]");
        if (infoNode != null) {
            Elements bookInfos = infoNode.select("a");
            if (bookInfos != null && bookInfos.size() > 0) {
                // The last anchor is handled separately with the flag set to true;
                // all anchors before it are converted with the flag set to false
                Element terminalCataNode = bookInfos.last();
                bookInfos.remove(terminalCataNode);
                List<BookClass> tmplist = bookInfos.stream()
                        .map(bookInfo -> getBookCata(bookInfo, false))
                        .filter(Objects::nonNull)
                        .collect(Collectors.toList());
                BookClass terminalBookClass = getBookCata(terminalCataNode, true);
                if (terminalBookClass != null) {
                    tmplist.add(terminalBookClass);
                }
                bookClasses = tmplist.toArray(bookClasses);
            }
        }
        bookBookClass = new RootBookClass().link(bookClasses);

        // Get the author, publish date, subject terms and category
        String info = element.text();
        Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[::](.*) 出版日期[::](\\d+).*?(?:主题词[::](.+))? 分类[::](.*)");
        Matcher matcher = pattern.matcher(info);
        while (matcher.find()) {
            // A match overrides the name read from p[class=name]
            name = matcher.group(1);
            author = matcher.group(2);
            publishDate = matcher.group(3);
            theme = matcher.group(4);
            detailBookClass = matcher.group(5);
        }
        // A title enclosed in 《》 anywhere in the text takes precedence as the name
        Pattern minPattern = Pattern.compile(".*(《.*》).*");
        Matcher minMatcher = minPattern.matcher(info);
        while (minMatcher.find()) {
            name = minMatcher.group(1);
        }

        // Assemble the book
        if (name != null && id != null) {
            Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
            book.setCookie(cookie);
            books.add(book);
            if (bookBookClass.isTerminal()) {
                ((TerminalBookClass) bookBookClass).addBook(book);
            } else {
                System.out.println("未获取到分类信息,将不被归档 " + book);
            }
        } else {
            System.out.println("error: " + info);
        }
    }
    return books;
}
 
Example 14
Source File: BookClass.java    From nju-lib-downloader with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Builds {@code Book} objects from the given book list nodes.
 * <p>
 * For every node the name and id are read from {@code p[class=name]}, the category chain from
 * the anchors in {@code p[class=info]} (linked under this book class via {@code this.link}),
 * and author, publish date, subject terms and category text from the node's text via regular
 * expressions. A node yields a book only when both a name and an id were found; otherwise an
 * error line is printed. Books whose category is terminal are also added to that category.
 *
 * @param booksliNode the nodes to convert, one per book
 * @return the books that could be built
 */
private Set<Book> queryBooks(Elements booksliNode) {
    Set<Book> books = new HashSet<>();
    for (Element element : booksliNode) {
        // Get the book name and id
        String name = null, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
        BookClass bookBookClass;
        Elements nameIdNode = element.select("p[class=name]");
        if (nameIdNode != null) {
            name = nameIdNode.text();
            Elements idNode = nameIdNode.select("a[onclick]");
            if (idNode != null && idNode.size() > 0) {
                // The id is the text between the first "(" and the last "," of the onclick value
                String idOnClick = idNode.get(0).attr("onclick");
                int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
                if (start != 0 && end != -1) {
                    id = idOnClick.substring(start, end);
                }
            }
        }
        // Get the categories
        BookClass[] bookClasses = new BookClass[0];
        Elements infoNode = element.select("p[class=info]");
        if (infoNode != null) {
            Elements bookInfos = infoNode.select("a");
            if (bookInfos != null && bookInfos.size() > 0) {
                // The last anchor is handled separately with the flag set to true;
                // all anchors before it are converted with the flag set to false
                Element terminalCataNode = bookInfos.last();
                bookInfos.remove(terminalCataNode);
                List<BookClass> tmplist = bookInfos.stream()
                        .map(bookInfo -> getBookCata(bookInfo, false))
                        .filter(Objects::nonNull)
                        .collect(Collectors.toList());
                BookClass terminalBookClass = getBookCata(terminalCataNode, true);
                if (terminalBookClass != null) {
                    tmplist.add(terminalBookClass);
                }
                bookClasses = tmplist.toArray(bookClasses);
            }
        }
        bookBookClass = this.link(bookClasses);

        // Get the author, publish date, subject terms and category
        String info = element.text();
        Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[::](.*) 出版日期[::](\\d+).*?(?:主题词[::](.+))? 分类[::](.*)");
        Matcher matcher = pattern.matcher(info);
        while (matcher.find()) {
            // A match overrides the name read from p[class=name]
            name = matcher.group(1);
            author = matcher.group(2);
            publishDate = matcher.group(3);
            theme = matcher.group(4);
            detailBookClass = matcher.group(5);
        }
        // A title enclosed in 《》 anywhere in the text takes precedence as the name
        Pattern minPattern = Pattern.compile(".*(《.*》).*");
        Matcher minMatcher = minPattern.matcher(info);
        while (minMatcher.find()) {
            name = minMatcher.group(1);
        }

        // Assemble the book
        if (name != null && id != null) {
            Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
            book.setCookie(cookie);
            books.add(book);
            if (bookBookClass.isTerminal()) {
                ((TerminalBookClass) bookBookClass).addBook(book);
            } else {
                System.out.println("未获取到分类信息,将不被归档 " + book);
            }
        } else {
            System.out.println("error: " + info);
        }
    }
    return books;
}
 
Example 15
Source File: JsoupHelper.java    From seed with Apache License 2.0 4 votes vote down vote up
/**
 * Scrapes the content of a Tianya forum thread.
 * Currently only the posts written by the thread starter are captured; the content is stored
 * in a txt file, named after the thread URL, on the user's desktop.
 * @param bbsURL      thread address (the first page or any other page of the thread may be passed)
 * @param finalPageNo the highest page number of the thread (if the value exceeds the real last page,
 *                    the job stops automatically after the last page has been scraped)
 */
private static void getTianyaBBSTxt(String bbsURL, int finalPageNo) throws IOException {
    String txt;
    String author;
    String publishTime;
    Element atlInfo;
    Elements elements;
    Document document;
    // Strip the parameters from the URL
    bbsURL = bbsURL.endsWith("shtml") ? bbsURL : bbsURL.substring(0, bbsURL.indexOf(".shtml")+6);
    // Work out the txt file to write to and clear its content up front (if it already exists)
    String filePath = FileSystemView.getFileSystemView().getHomeDirectory().getAbsolutePath();
    String fileName = bbsURL.substring(bbsURL.indexOf("post")).replace(".shtml", ".txt");
    File bbsFile = new File(filePath, fileName);
    FileUtils.writeStringToFile(bbsFile, "", StandardCharsets.UTF_8);
    // Get the page number the thread URL starts at
    int pageNo = Integer.parseInt(bbsURL.substring(bbsURL.lastIndexOf("-")+1, bbsURL.lastIndexOf(".")));
    // Process every post of every page
    // NOTE(review): with "i<finalPageNo" the page numbered finalPageNo itself is never fetched,
    // although the Javadoc describes finalPageNo as the highest page - confirm whether "<=" was intended.
    for(int i=pageNo; i<finalPageNo; i++){
        if(i == 1){
            /*
             * Handle the opening post separately (it only exists on the first page)
             */
            document = Jsoup.connect(bbsURL).get();
            // Read the author and the publish time
            atlInfo = document.getElementById("post_head").select("div.atl-info").first();
            author = atlInfo.select("span").eq(0).select("a").first().text();
            publishTime = atlInfo.select("span").eq(1).text();
            // Get the posts: every <div class="atl-item"></div> is one post, the opening post included
            elements = document.getElementsByClass("atl-item");
            // The actual post content is wrapped in <div class="bbs-content"></div>
            txt = elements.first().select("div.bbs-content").html().replaceAll("<br>", "");
            // Write to the txt file
            FileUtils.writeStringToFile(bbsFile, "楼主:"+author+","+publishTime+"\r\n", StandardCharsets.UTF_8, true);
            FileUtils.writeStringToFile(bbsFile, txt+"\r\n\r\n", StandardCharsets.UTF_8, true);
            // The opening post has been handled, so remove it from the list
            elements.remove(0);
        }else{
            /*
             * For pages other than the first one, the URL is recomputed and the content fetched again each time
             */
            bbsURL = bbsURL.replace("-"+(i-1)+".shtml", "-"+i+".shtml");
            document = Jsoup.connect(bbsURL).get();
            // Requests beyond the last page are redirected by Tianya to the last page
            if(!bbsURL.equals(document.location())){
                System.out.println("帖子抓取完毕");
                return;
            }
            // Get the elements of this page that need scraping
            elements = document.getElementsByClass("atl-item");
        }
        /*
         * Both branches above end up with the elements of this page that have to be iterated
         */
        for(Element obj: elements){
            atlInfo = obj.select("div.atl-info").first();
            String authorType = atlInfo.select("strong.host").text();
            // A blank author type means the post was not written by the thread starter; it is not written to the txt for now
            if(StringUtils.isNotBlank(authorType)){
                txt = obj.select("div.bbs-content").html().replaceAll("<br>", "");
                author = atlInfo.select("span").eq(0).select("a").first().text();
                publishTime = atlInfo.select("span").eq(1).text();
                FileUtils.writeStringToFile(bbsFile, authorType+":"+author+","+publishTime+"\r\n", StandardCharsets.UTF_8, true);
                FileUtils.writeStringToFile(bbsFile, txt+"\r\n\r\n", StandardCharsets.UTF_8, true);
            }
        }
    }
}
 
Example 16
Source File: Expression.java    From firing-range with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the {@code q} parameter as an HTML body fragment and echoes back the elements that
 * pass the attribute checks.
 * <p>
 * An element is left out when it has an attribute whose lower-cased name starts with "on" or
 * equals "href" or "src", or a "style" attribute whose lower-cased value contains
 * "expression". The remaining elements are concatenated and passed to
 * {@code Responses.sendXssed}. A 400 error is sent when {@code q} is missing or contains no
 * elements besides the body.
 *
 * @param request  the incoming request
 * @param response the response to write to
 * @throws IOException declared by the signature
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
  String q = request.getParameter("q");
  if (q == null) {
    Responses.sendError(response, "Missing q parameter", 400);
    return;
  }

  Element body = Jsoup.parseBodyFragment(q).body();
  Elements elements = body.getAllElements();
  // the synthetic body wrapper is not part of the user's input
  elements.remove(body);
  if (elements.isEmpty()) {
    Responses.sendError(response, "Invalid input, no tags", 400);
    return;
  }

  StringBuilder output = new StringBuilder();
  for (Element element : elements) {
    boolean rejected = false;

    for (Attribute attribute : element.attributes()) {
      String key = attribute.getKey().toLowerCase();
      if (key.startsWith("on") || key.equals("href") || key.equals("src")) {
        rejected = true;
      } else if (key.equals("style")
          && attribute.getValue().toLowerCase().contains("expression")) {
        rejected = true;
      }
    }

    if (!rejected) {
      output.append(element);
    }
  }
  Responses.sendXssed(response, output.toString());
}
 
Example 17
Source File: TtsHelper.java    From coolreader with MIT License 4 votes vote down vote up
/**
 * Removes from {@code elements} every element of {@code el}'s subtree, as returned by
 * {@code el.getAllElements()}.
 *
 * @param el       the element whose subtree is to be taken out of the list
 * @param elements the list to remove from
 */
private void removeAllChildren(Element el, Elements elements) {
	Elements subtree = el.getAllElements();
	for (int idx = 0; idx < subtree.size(); idx++) {
		elements.remove(subtree.get(idx));
	}
}
 
Example 18
Source File: MTGGradeGrader.java    From MtgDesktopCompanion with GNU General Public License v3.0 3 votes vote down vote up
/**
 * Loads the grading of a product from its page on the grader's web site.
 * <p>
 * The page at {@code <website>/produit/<identifier>} is fetched; when it contains no
 * {@code table.table-product tr} rows, {@code null} is returned. Otherwise the first row is
 * dropped and the text of the fourth {@code td} of the remaining rows is parsed as the grade.
 *
 * @param identifier the product identifier, also stored as the grading's number id
 * @return the grading, or {@code null} when the product table has no rows
 * @throws IOException declared by the signature; NOTE(review): presumably raised by the request
 */
@Override
public Grading loadGrading(String identifier) throws IOException {

	final String url = getWebSite()+"/produit/"+identifier;

	Document page = RequestBuilder.build().method(METHOD.GET)
			   .setClient(URLTools.newClient())
			   .url(url)
			   .toHtml();

	Elements rows = page.select("table.table-product tr");
	if (rows.isEmpty()) {
		return null;
	}

	Grading grading = new Grading();
	grading.setGraderName(getName());
	grading.setNumberID(identifier);
	grading.setUrlInfo(url);

	// skip the first row of the table
	rows.remove(0);

	logger.debug("found " + rows.text());

	Elements cells = rows.select("td");
	grading.setGradeNote(Double.parseDouble(cells.get(3).text()));

	return grading;
}