Java Code Examples for org.jsoup.Jsoup.parse()

The following are Java code examples showing how to use parse() of the org.jsoup.Jsoup class. You can vote up the examples you like. Your votes will be used in our system to surface more good examples.
+ Save this method
Example 1
Project: PicKing   File: Mzitu.java   View Source Code Vote up 6 votes
/**
 * Parses a listing page and collects one {@link AlbumInfo} per anchor under
 * {@code #pins} that contains an image.
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl URL of the page being parsed; stored back into {@code resultMap}
 * @param result     raw page bytes, decoded as UTF-8
 * @param resultMap  map that receives {@code CURRENT_URL} and {@code RESULT}
 * @return the same {@code resultMap} instance, after both entries were added
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public Map<ContentsActivity.parameter, Object> getContent(String baseUrl, String currentUrl, byte[] result, Map<ContentsActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    List<AlbumInfo> data = new ArrayList<>();
    Document document = Jsoup.parse(new String(result, "utf-8"));
    for (Element anchor : document.select("#pins a:has(img)")) {
        AlbumInfo album = new AlbumInfo();
        album.setAlbumUrl(anchor.attr("href"));
        Elements images = anchor.select("img");
        if (!images.isEmpty()) {
            String picUrl = images.get(0).attr("data-original");
            Log.e("Mzitu", "getContent: " + picUrl);
            // Upgrade only a leading "http://" scheme. A blanket replace("http", "https")
            // turns an already-secure URL into "httpss://..." and also rewrites any
            // later occurrence of "http" inside the path.
            if (picUrl.startsWith("http://")) {
                picUrl = "https://" + picUrl.substring("http://".length());
            }
            album.setPicUrl(picUrl);
        }
        data.add(album);
    }
    resultMap.put(ContentsActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(ContentsActivity.parameter.RESULT, data);
    return resultMap;
}
 
Example 2
Project: zhkuas_ssm_maven   File: SubmitCourseResultAnalysiser.java   View Source Code Vote up 6 votes
/**
 * Extracts the course-submission result text from a response page.
 *
 * <p>The text is taken from the {@code <font>} elements found inside the cells of
 * the first {@code <table>}. A single element is returned as-is; several elements
 * are concatenated, each followed by the literal {@code "</br>"} separator.
 *
 * @param html the response page markup, may be {@code null}
 * @return {@code null} when {@code html} is {@code null}; a fixed "result not
 *         available" message when no table or no font element exists; otherwise
 *         the inner HTML of the font element(s)
 */
public String doAnalysis(String html){
	if (html == null) {
		return null;
	}
	Document doc = Jsoup.parse(html);
	Elements tables = doc.select("table");
	if (tables.isEmpty()) {
		return "获取不到选课结果";
	}
	Elements fonts = tables.get(0).select("td").select("font");
	if (fonts.isEmpty()) {
		return "获取不到选课结果";
	}
	if (fonts.size() == 1) {
		return fonts.get(0).html();
	}
	// Accumulate in a builder rather than re-allocating a String per element.
	// The "</br>" separator is kept verbatim because callers may rely on it.
	StringBuilder result = new StringBuilder();
	for (Element font : fonts) {
		result.append(font.html()).append("</br>");
	}
	return result.toString();
}
 
Example 3
Project: PicKing   File: Yande.java   View Source Code Vote up 6 votes
/**
 * Builds the album list for a listing page: every anchor matched by
 * {@code #post-list-posts li div.inner a} becomes one {@link AlbumInfo}.
 *
 * @param baseUrl    prefix prepended to each anchor's {@code href}
 * @param currentUrl URL of the parsed page; written back under {@code CURRENT_URL}
 * @param result     raw page bytes, decoded as UTF-8
 * @param resultMap  map that receives {@code CURRENT_URL} and {@code RESULT}
 * @return the same {@code resultMap} instance
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public Map<ContentsActivity.parameter, Object> getContent(String baseUrl, String currentUrl, byte[] result, Map<ContentsActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    String html = new String(result, "utf-8");
    Elements anchors = Jsoup.parse(html).select("#post-list-posts li div.inner a");

    List<AlbumInfo> data = new ArrayList<>();
    for (Element anchor : anchors) {
        AlbumInfo album = new AlbumInfo();
        album.setAlbumUrl(baseUrl + anchor.attr("href"));

        // The thumbnail is optional; the album is added either way.
        Elements thumbnails = anchor.select("img");
        if (!thumbnails.isEmpty()) {
            album.setPicUrl(thumbnails.get(0).attr("src"));
        }
        data.add(album);
    }

    resultMap.put(ContentsActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(ContentsActivity.parameter.RESULT, data);
    return resultMap;
}
 
Example 4
Project: exam   File: Question.java   View Source Code Vote up 6 votes
/**
 * Validates the markup of a cloze-test question.
 *
 * <p>Checks are applied in order and the first failure wins: the text must contain
 * {@code cloze="true"}; the ids of all {@code span[cloze=true]} elements must be
 * distinct; every such span needs a non-empty, parsable {@code precision}; and the
 * text of every span marked {@code numeric="true"} must be parsable as a number.
 *
 * @param node JSON node whose "question" field holds the question markup
 * @return a short reason describing the first failed check, or {@code null} if
 *         all checks pass
 */
@Transient
private String getClozeTestQuestionContentValidationResult(JsonNode node) {
    String questionText = node.get("question").asText();
    if (!questionText.contains("cloze=\"true\"")) {
        return "no embedded answers";
    }

    Elements answers = Jsoup.parse(questionText).select("span[cloze=true]");

    Set<String> distinctIds = answers.stream().map(a -> a.attr("id")).collect(Collectors.toSet());
    if (distinctIds.size() != answers.size()) {
        return "duplicate ids found";
    }

    for (Element answer : answers) {
        String precision = answer.attr("precision");
        if (precision.isEmpty() || !NumberUtils.isParsable(precision)) {
            return "invalid precision found";
        }
    }

    for (Element answer : answers) {
        if (answer.attr("numeric").equals("true") && !NumberUtils.isParsable(answer.text())) {
            return "non-numeric correct answer for numeric question";
        }
    }
    return null;
}
 
Example 5
Project: PicKing   File: Aitaotu.java   View Source Code Vote up 6 votes
/**
 * Collects the pictures of a detail page as {@link PicInfo} entries.
 *
 * <p>Each image under {@code #big-pic} yields one entry. All entries share the
 * title (first {@code #photos h1}) and time (first {@code .tsmaincont-desc span});
 * both default to the empty string when the element is missing.
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl URL of the parsed page; written back under {@code CURRENT_URL}
 * @param result     raw page bytes, decoded as UTF-8
 * @param resultMap  map that receives {@code CURRENT_URL} and {@code RESULT}
 * @return the same {@code resultMap} instance
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public Map<DetailActivity.parameter, Object> getDetailContent(String baseUrl, String currentUrl, byte[] result, Map<DetailActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    Document document = Jsoup.parse(new String(result, "utf-8"));

    Elements headings = document.select("#photos h1");
    String sTitle = headings.size() > 0 ? headings.get(0).text() : "";

    Elements timestamps = document.select(".tsmaincont-desc span");
    String sTime = timestamps.size() > 0 ? timestamps.get(0).text() : "";

    List<PicInfo> urls = new ArrayList<>();
    for (Element image : document.select("#big-pic img")) {
        PicInfo picture = new PicInfo(image.attr("src"));
        urls.add(picture.setTitle(sTitle).setTime(sTime));
    }

    resultMap.put(DetailActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(DetailActivity.parameter.RESULT, urls);
    return resultMap;
}
 
Example 6
Project: zhkuas_ssm_maven   File: HTMLUtil.java   View Source Code Vote up 6 votes
/**
 * Builds a name/value map for submitting a form found in {@code html}.
 *
 * <p>The map contains the {@code action} attribute of the form at
 * {@code formIndex} under the key {@code "formAction"}, plus one entry per
 * {@code <input>} element that has a non-empty {@code name}. Inputs are collected
 * from the whole document, not only from the chosen form, and {@code <select>}
 * elements are not collected.
 *
 * @param html      page markup
 * @param formIndex zero-based index of the form whose action is recorded
 * @return the populated map, or {@code null} if anything fails (for example
 *         {@code html} is {@code null} or {@code formIndex} is out of range)
 */
public static Map<String, String> getFormMap_Kingo(String html, int formIndex) {
	Map<String, String> retVal = new HashMap<String, String>();
	try {
		Document doc = Jsoup.parse(html);
		Element formElement = doc.select("form").get(formIndex);
		retVal.put("formAction", formElement.attr("action"));
		// The former "select" branch was unreachable: only <input> elements are
		// selected here, so nodeName() could never equal "select". It was removed
		// without changing the returned map.
		for (Element input : doc.select("input")) {
			String name = input.attr("name");
			if (!name.isEmpty()) {
				retVal.put(name, input.attr("value"));
			}
		}
	} catch (Exception e) {
		// Existing contract: any failure is reported to the caller as null.
		retVal = null;
	}
	return retVal;
}
 
Example 7
Project: zhkuas_ssm_maven   File: HTMLUtil.java   View Source Code Vote up 6 votes
/**
 * Finds the {@code <select>} element with the given name, including selects
 * whose markup is only present inside a script block (filled in dynamically
 * by JavaScript).
 *
 * @param html       page markup, may be {@code null}
 * @param selectName value of the {@code name} attribute to look for
 * @return the matching element, or {@code null} if {@code html} is {@code null}
 *         or nothing matches
 */
public static Element getSelectorByName(String html,String selectName){
	if (html == null) {
		return null;
	}
	final String query = "select[name=" + selectName + "]";
	Document doc = Jsoup.parse(html);

	// First try the page itself; the match only counts if it has options and
	// non-blank text.
	Elements fromPage = doc.select(query);
	boolean usable = fromPage != null
			&& fromPage.select("option").size() > 0
			&& fromPage.text() != null
			&& !fromPage.text().trim().equals("");
	if (usable) {
		return fromPage.first();
	}

	// Otherwise fall back to select markup embedded in the script blocks.
	Elements scripts = doc.select("script");
	if (scripts == null || scripts.size() <= 0) {
		return null;
	}
	String embedded = scripts.html().replaceAll("[\\s\\S]*(<select[\\w\\W]*>[\\w\\W]+</select>)", "$1");
	Document fragment = Jsoup.parse("<html>" + embedded + "</html>");
	return fragment.select(query).first();
}
 
Example 8
Project: ZhihuQuestionsSpider   File: KuaidailiProxySite.java   View Source Code Vote up 5 votes
/**
 * Extracts proxies from the table rows matched by
 * {@code div#list table tbody tr}.
 *
 * <p>The first cell of a row is read as the IP and the second as the port. Rows
 * with fewer than two cells, or with a non-numeric port, are skipped so that one
 * malformed row does not discard the rest of the page.
 *
 * @param content page markup
 * @return the proxies found, possibly empty, never {@code null}
 */
@Override
public List<Proxy> parseProxys(String content) {
    List<Proxy> proxyList = new ArrayList<>();
    Document doc = Jsoup.parse(content);
    for (Element tr : doc.select("div#list table tbody tr")) {
        Elements tds = tr.children();
        if (tds.size() < 2) {
            continue;
        }
        String ip = tds.get(0).text().trim();
        Integer port;
        try {
            port = Integer.parseInt(tds.get(1).text().trim());
        } catch (NumberFormatException ignored) {
            // Not a usable proxy row; skip it and keep the others.
            continue;
        }
        proxyList.add(new Proxy(ip, port));
    }
    return proxyList;
}
 
Example 9
Project: crawler-jsoup-maven   File: CSDNQA.java   View Source Code Vote up 5 votes
/**
 * Demo: parses a deliberately malformed HTML string (stray {@code <ACTxxx} and an
 * unclosed second {@code <body>}) and prints the inner HTML of the body of the
 * resulting document to standard output.
 *
 * @throws IOException declared by the signature; nothing in the body throws it
 */
public static void jsoupIOTest03() throws IOException{
    StringBuilder markup = new StringBuilder();
    markup.append(" <html>");
    markup.append("   <head>");
    markup.append("     <title>JsoupInputAndOutput</title>");
    markup.append("   </head>");
    markup.append("     <body> hhhh<ACTxxx<body>");
    markup.append(" </html>");

    Document parsed = Jsoup.parse(markup.toString());
    System.out.println(parsed.body().html());
}
 
Example 10
Project: PicKing   File: Meizi4493.java   View Source Code Vote up 5 votes
/**
 * Resolves the URL of the next listing page.
 *
 * <p>The "next" link is the first anchor under {@code div.page} whose own text
 * contains "下一页" or "&gt;". Its {@code href} is appended to the part of
 * {@code currentUrl} that runs from "http" up to and including the last "/".
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl URL of the page being parsed
 * @param result     raw page bytes, decoded as GB2312
 * @return the next-page URL, or the empty string when there is no next link or
 *         {@code currentUrl} does not match {@code http.*&#47;}
 * @throws UnsupportedEncodingException if the "gb2312" charset name is not supported
 */
@Override
public String getContentNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    Document document = Jsoup.parse(new String(result, "gb2312"));
    Elements elements = document.select("div.page a:containsOwn(下一页),div.page a:containsOwn(>)");
    if (elements.isEmpty()) {
        return "";
    }
    Matcher matcher = Pattern.compile("http.*/").matcher(currentUrl);
    if (!matcher.find()) {
        return "";
    }
    return matcher.group() + elements.get(0).attr("href");
}
 
Example 11
Project: wulkanowy   File: StudentAndParentTest.java   View Source Code Vote up 5 votes
/**
 * Expects {@code getSnpPageUrl()} to throw {@link NotLoggedInErrorException} when
 * the page returned for the start URL is the "OcenyWszystkie-semester.html"
 * fixture.
 */
@Test(expected = NotLoggedInErrorException.class)
public void getSnpPageUrlWithWrongPage() throws Exception {
    String fixtureHtml = FixtureHelper.getAsString(
            getClass().getResourceAsStream("OcenyWszystkie-semester.html"));
    Document wrongPageDocument = Jsoup.parse(fixtureHtml);

    // Stub the collaborators; the methods under test run their real implementation.
    Mockito.when(snp.getPageByUrl(Mockito.anyString())).thenReturn(wrongPageDocument);
    Mockito.when(snp.getStartPageUrl()).thenReturn("http://wulkan.io");
    Mockito.when(snp.getId()).thenCallRealMethod();
    Mockito.when(snp.getSnpPageUrl()).thenCallRealMethod();

    snp.getSnpPageUrl();
}
 
Example 12
Project: LeMondeRssReader   File: ArticleActivity.java   View Source Code Vote up 5 votes
/**
 * Handles the downloaded article page: parses it, chooses how to extract the
 * content (blog, large format, video or standard article), updates the header
 * and toolbar accordingly, and hands the extracted items to the adapter.
 *
 * @param response the article page markup
 */
@Override
public void onResponse(String response) {
    // Hide icon
    findViewById(R.id.noNetwork).setVisibility(View.INVISIBLE);

    Document doc = Jsoup.parse(response);

    // If article was loaded from an external App, no image was passed from MainActivity,
    // so it must be fetched in the Collapsing Toolbar
    if (Intent.ACTION_VIEW.equals(getIntent().getAction())) {
        // The image URL is read from the page's Open Graph "og:image" meta tag.
        Elements image = doc.select("meta[property=og:image]");
        if (atLeastOneChild(image)) {
            Picasso.with(ArticleActivity.this)
                    .load(image.first().attr("content"))
                    .into((ImageView) findViewById(R.id.imageArticle));
        }
    }

    // Article is from a hosted blog
    List<Model> items;
    Element content = doc.getElementById("content");
    if (content != null) {
        items = extractBlogArticle(content);
        setTagInHeader(R.string.blog_article, R.color.accent_complementary, Color.WHITE);
    } else {
        // Not a blog page: use the category block, when present, as the screen title.
        Elements category = doc.select("div.tt_rubrique_ombrelle");
        if (atLeastOneChild(category)) {
            Log.d(TAG, "Cat: " + category.text());
            setTitle(category.text());
        }
        Elements articles = doc.getElementsByTag("article");
        Element largeFormat = doc.getElementById("hors_format");
        if (largeFormat != null) {
            // Large-format pages are only tagged in the header; no items are extracted.
            items = new ArrayList<>();
            setTagInHeader(R.string.large_article, R.color.primary_dark, Color.WHITE);
        } else if (articles.isEmpty()) {
            // Video
            items = extractVideo(doc);
            setTagInHeader(R.string.video_article, R.color.accent_complementary, Color.WHITE);
        } else {
            // Standard article
            items = extractStandardArticle(articles);
            // Full article is restricted to paid members
            if (doc.getElementById("teaser_article") != null) {
                // Paid article: switch the share icon, toolbar scrim and back arrow
                // to the variants used with the accent colour.
                if (menu != null) {
                    MenuItem menuItem = menu.findItem(R.id.action_share);
                    if (menuItem != null) {
                        menuItem.setIcon(getResources().getDrawable(R.drawable.ic_share_black));
                    }
                } else {
                    Log.e(TAG, "menu should not be null at this point!");
                }

                CollapsingToolbarLayout collapsingToolbar = findViewById(R.id.collapsing_toolbar);
                collapsingToolbar.setContentScrimResource(R.color.accent);
                setTagInHeader(R.string.paid_article, R.color.accent, Color.BLACK);

                if (getSupportActionBar() != null) {
                    final Drawable upArrow = getResources().getDrawable(R.drawable.ic_arrow_back_black_24dp);
                    getSupportActionBar().setHomeAsUpIndicator(upArrow);
                }
            }
            // After parsing the article, start a new request for comments
            Element react = doc.getElementById("liste_reactions");
            if (react != null) {
                // "[^data-aj-uri]" matches elements having an attribute whose name
                // starts with "data-aj-uri"; the first match supplies the comments path.
                Elements dataAjURI = react.select("[^data-aj-uri]");
                if (atLeastOneChild(dataAjURI)) {
                    String commentPreviewURI = Constants.BASE_URL2 + dataAjURI.first().attr("data-aj-uri");
                    REQUEST_QUEUE.add(new StringRequest(Request.Method.GET, commentPreviewURI, commentsReceived, errorResponse));
                }
            }
        }
    }
    // Publish whatever was extracted and hide the loading indicator.
    articleAdapter.insertItems(items);
    findViewById(R.id.articleLoader).setVisibility(View.GONE);
}
 
Example 13
Project: okhttp-byte-counter   File: Crawler.java   View Source Code Vote up 5 votes
/**
 * Downloads {@code url} and, when the response is a 200 HTML page, enqueues every
 * link on it whose top private domain is {@code google.com}.
 *
 * <p>Hosts are rate-limited: once a host has been requested more than 100 times,
 * further URLs on it are ignored without issuing a request.
 *
 * @param url the page to fetch
 * @throws IOException if the HTTP call or reading the body fails
 */
public void fetch(HttpUrl url) throws IOException {
  // Per-host visit counter; reuse the registered one if another call got there first.
  AtomicInteger freshCounter = new AtomicInteger();
  AtomicInteger registered = hostnames.putIfAbsent(url.host(), freshCounter);
  AtomicInteger visits = (registered != null) ? registered : freshCounter;
  if (visits.incrementAndGet() > 100) {
    return;
  }

  Request request = new Request.Builder().url(url).build();
  Response response = client.newCall(request).execute();

  // Only successful responses with a parsable HTML content type are crawled.
  String contentType = response.header("Content-Type");
  boolean candidate = response.code() == 200 && contentType != null;
  MediaType mediaType = candidate ? MediaType.parse(contentType) : null;
  if (mediaType == null || !mediaType.subtype().equalsIgnoreCase("html")) {
    response.body().close();
    return;
  }

  Document document = Jsoup.parse(response.body().string(), url.toString());
  for (Element anchor : document.select("a[href]")) {
    HttpUrl link = response.request().url().resolve(anchor.attr("href"));
    // resolve() yields null for invalid URLs and for non-http/https schemes.
    if (link != null && "google.com".equals(link.topPrivateDomain())) {
      queue.add(link.newBuilder().fragment(null).build());
    }
  }
}
 
Example 14
Project: PicKing   File: Aitaotu.java   View Source Code Vote up 5 votes
/**
 * Resolves the URL of the next listing page from the first {@code #pageNum}
 * anchor whose own text contains "下一页".
 *
 * @param baseUrl    prefix prepended to the anchor's {@code href}
 * @param currentUrl not used by this implementation
 * @param result     raw page bytes, decoded as UTF-8
 * @return {@code baseUrl} plus the link's {@code href}, or the empty string when
 *         no such link exists
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public String getContentNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    String html = new String(result, "utf-8");
    Elements nextLinks = Jsoup.parse(html).select("#pageNum a:containsOwn(下一页)");
    return nextLinks.size() > 0
            ? baseUrl + nextLinks.get(0).attr("href")
            : "";
}
 
Example 15
Project: vscrawler   File: XpathNode.java   View Source Code Vote up 5 votes
/**
 * Returns the cached model, creating it on first use.
 *
 * <p>The raw text is parsed with jsoup against the base URL and wrapped through
 * {@code SIPNode.e}. If that fails for any reason, the model is built from the
 * raw text through {@code SIPNode.t} instead.
 *
 * @return the existing or newly created model
 */
@Override
public SipNodes createOrGetModel() {
    if (model != null) {
        return model;
    }
    try {
        Document parsed = Jsoup.parse(getRawText(), getBaseUrl());
        if (parsed == null) {
            // Routed into the fallback below.
            throw new RuntimeException();
        }
        model = new SipNodes(SIPNode.e(parsed));
    } catch (Exception e) {
        model = new SipNodes(SIPNode.t(getRawText()));
    }
    return model;
}
 
Example 16
Project: reactive-ms-example   File: StaticRouterTests.java   View Source Code Vote up 4 votes
/**
 * Asserts that the first {@code TITLE_TAG} element in the head of {@code html}
 * has exactly the text {@code title}.
 *
 * @param html  page markup to inspect
 * @param title expected title text
 */
private void verifyTitleIs(final String html, final String title) {
    Element titleElement = Jsoup.parse(html).head().getElementsByTag(TITLE_TAG).get(0);
    assertThat(titleElement.text(), is(title));
}
 
Example 17
Project: xq_seckill_microservice   File: LianJiaCheckCallable.java   View Source Code Vote up 4 votes
/**
 * Crawls the listings of one region, page by page, and publishes every listing
 * to the message queue.
 *
 * <p>The region page supplies the pagination element ({@code page-url} template
 * and {@code page-data} JSON). Each page is then fetched, its listing items are
 * parsed into {@code EstateItemDTO}s, serialized and sent to {@code queueName}.
 * After every page the thread sleeps: 2000 ms when the next page number is a
 * multiple of 5, otherwise a random delay below 200 ms.
 *
 * @param regionLink region path appended to the configured base URL
 * @throws IllegalStateException if a valid region page has no pagination element
 * @throws Exception if fetching, parsing, serializing or sending fails, or the
 *                   thread is interrupted while sleeping
 */
private void fetchData(String regionLink) throws Exception {
    String regionHtml = LianJiaCrawler.INSTANCE.doGet(lianJiaParam.getBaseUrl() + regionLink, lianJiaParam.getAppProperties().getLiaJiaCookie());
    Document document = Jsoup.parse(regionHtml);
    if (!LianJiaCrawler.INSTANCE.checkValidHtml(document)) {
        return;
    }

    Element pageElement = document.select("div[comp-module='page']").first();
    if (pageElement == null) {
        // first() returns null when nothing matches; fail with a clear message
        // instead of an anonymous NullPointerException on the next line.
        throw new IllegalStateException("No pagination element found for region " + regionLink);
    }

    // URL template of the listing pages; "{page}" is replaced by the page number.
    String pageUrl = pageElement.attr("page-url");
    Map pageData = JSON.parseObject(pageElement.attr("page-data"), Map.class);
    int curPage = MapUtils.getIntValue(pageData, "curPage");
    // NOTE(review): together with the "<=" below this requests one page past the
    // reported totalPage. It looks like an off-by-one, but is left unchanged
    // because it may be deliberate. TODO confirm.
    int totalPage = MapUtils.getIntValue(pageData, "totalPage") + 1;

    Random random = new Random();
    // Fetch every page.
    while (curPage <= totalPage) {
        String curPageUrl = pageUrl.replace("{page}", String.valueOf(curPage));
        String listHtml = LianJiaCrawler.INSTANCE.doGet(lianJiaParam.getBaseUrl() + curPageUrl, lianJiaParam.getAppProperties().getLiaJiaCookie());
        document = Jsoup.parse(listHtml);
        if (LianJiaCrawler.INSTANCE.checkValidHtml(document)) {
            int count = 1;
            Elements contentElements = document.select("ul[class='sellListContent'] > li");
            Elements bigImgElements = document.select("div[class='bigImgList'] > div");

            for (Element contentElement : contentElements) {
                LOGGER.info("开始检查[{}],第{}页,第{}条记录", regionLink, curPage, count);
                // Parse the list data.
                EstateItemDTO dto = LianJiaCrawler.INSTANCE.parseListData(contentElement);
                // Parse the cover image.
                dto = LianJiaCrawler.INSTANCE.parseCoverImgData(dto, bigImgElements);

                dto.setBatch(lianJiaParam.getBatch());
                // Serialize to bytes and send the message.
                byte[] dataArray = ProtoStuffUtil.serialize(dto);
                mqProducer.sendQueueMessage(dataArray, queueName);
                count++;
            }
        }
        curPage++;

        // Throttle: a longer pause before every fifth page, a short random one otherwise.
        int sleepTime = (curPage % 5 == 0) ? 2000 : random.nextInt(200);
        Thread.sleep(sleepTime);
        LOGGER.info("休眠了{}毫秒", sleepTime);
    }
}
 
Example 18
Project: Babler   File: RSSScraper.java   View Source Code Vote up 4 votes
/**
 * Downloads the RSS feed at {@code this.url} and saves every entry whose text is
 * reliably detected as being in the target language.
 *
 * <p>For each entry the first content element is stripped of HTML and passed to
 * language detection. Matching entries are stored through {@code DAO} and
 * {@code FileSaver}, which resets {@code wrongCount}. Non-matching entries
 * increment {@code wrongCount}, and the feed is abandoned once it exceeds 3.
 * Entries without any content are skipped. An exception while processing an
 * entry is logged and stops the loop.
 *
 * @return a pair of ({@code numOfFiles}, {@code wrongCount}) after processing
 * @throws Exception if the URL is malformed or the feed cannot be read or parsed
 */
public AbstractMap.SimpleEntry<Integer, Integer> fetchAndSave() throws Exception {
    URL url = new URL(this.url);

    SyndFeedInput input = new SyndFeedInput();
    SyndFeed feed = input.build(new XmlReader(url));

    List <SyndEntry> entries = feed.getEntries();
    int items = entries.size();

    if (items > 0) {
        log.info("Attempting to parse rss feed: "+ this.url );
        log.info("This Feed has "+items +" items");
    }

    for (SyndEntry item : entries) {
        log.info("Title: " + item.getTitle());
        log.info("Link: " + item.getLink());

        // Feeds may carry entries without a content element; get(0) on an empty
        // list would abort the whole feed with an IndexOutOfBoundsException.
        List<?> contents = item.getContents();
        if (contents == null || contents.isEmpty()) {
            log.info("Item " + item.getTitle() + " has no content, skipping");
            continue;
        }
        SyndContentImpl contentHolder = (SyndContentImpl) contents.get(0);

        // Content might contain HTML; reduce it to plain text.
        String content = Jsoup.parse(contentHolder.getValue()).text();
        try {
            Result result = ld.detectLanguage(content, language);
            if (result.languageCode.equals(language) && result.isReliable) {
                FileSaver file = new FileSaver(content, this.language, "bs", item.getLink(), item.getUri(), String.valueOf(content.hashCode()));
                String fileName = file.getFileName();
                BlogPost post = new BlogPost(content,this.language,null,"bs",item.getLink(),item.getUri(),fileName);
                if (DAO.saveEntry(post)) {
                    file.save(this.logDb);
                    numOfFiles++;
                    wrongCount = 0;
                }
            } else {
                log.info("Item " + item.getTitle() + "is in a diff languageCode, skipping this post  "+ result.languageCode);
                wrongCount ++;
                // Give up on the feed only after repeated misses. The break used to
                // sit outside this check, so the first wrong-language post ended the
                // loop and the threshold never had any effect.
                if (wrongCount > 3) {
                    log.info("Already found 3 posts in the wrong languageCode, skipping this blog");
                    break;
                }
            }
        } catch (Exception e) {
            log.error(e);
            break;
        }
    }
    return new AbstractMap.SimpleEntry<>(numOfFiles,wrongCount);
}
 
Example 19
Project: webpage-update-subscribe   File: PageParser.java   View Source Code Vote up 4 votes
/**
 * Returns the links of a page after it has been run through {@code htmlFilter}
 * and the parsed result through {@code linkFilter}.
 *
 * @param html    page markup
 * @param baseUri passed to {@code htmlFilter} only; the document itself is parsed
 *                without a base URI
 * @return whatever {@code linkFilter} selects from the parsed document
 */
public static Elements getLinks(String html, String baseUri) {
	Document filtered = Jsoup.parse(htmlFilter(html, baseUri));
	return linkFilter(filtered);
}
 
Example 20
Project: Babler   File: TEDScraper.java   View Source Code Vote up 4 votes
/**
 * Collects the transcript URLs of all TED talks available in the given language
 * and then calls {@code getAndSaveData()}.
 *
 * <p>The first listing page is fetched to read the page count from the last
 * {@code .pagination__item} (1 when there is no pagination). Every listing page
 * is then scanned for anchors inside {@code .media__message}; each href has its
 * query string removed and is turned into a {@code transcript.json} URL stored
 * in {@code urls}. If the site reports that no talk exists for the language, the
 * constructor returns early without calling {@code getAndSaveData()}.
 *
 * @param language destination languageCode
 */
public TEDScraper(String language) {
    this.language = language;
    String iso1Lang = LanguageCode.convertIso2toIso1(language);
    this.logDb = new LogDB(this.language); //saving text files

    urls = new ArrayList<URL>(2);

    log.info("Scraping TED.COM for subtitles in:  "+ language);

    // Get the first page and parse it.
    HTTPClient client = new HTTPClient(VIDEOS_URL + iso1Lang);
    String html = client.getHTMLData();

    if (html.contains("We couldn't find a talk quite like that")) {
        log.info("TED.COM Does not have any talks in "+language + " langauge code");
        log.info("Stopping process");
        return;
    }

    Document doc = Jsoup.parse(html);

    // Read the number of pages from the page's pagination.
    Element lastPagination = doc.select(".pagination__item").last();
    int numOfPages = 1;
    if (lastPagination != null) {
        numOfPages = Integer.parseInt(lastPagination.text());
    }

    // For every page of that languageCode.
    for (int i = 1; i <= numOfPages; i++) {
        log.info("Getting links from page: "+i +" out of: "+numOfPages);

        // The first page was already fetched above.
        if (i != 1) {
            client = new HTTPClient(VIDEOS_URL + iso1Lang + "&page=" + i);
            html = client.getHTMLData();
            doc = Jsoup.parse(html);
        }

        Elements links = doc.select(".media__message").select("a");

        for (Element link : links) {
            String modifiedLink = link.attr("href");
            // Remove everything after the "?" -> /talks/ze_frank_are_you_human?languageCode=lt
            // Guarded: an href without a query string used to make substring(0, -1)
            // throw StringIndexOutOfBoundsException.
            int queryStart = modifiedLink.indexOf("?");
            if (queryStart >= 0) {
                modifiedLink = modifiedLink.substring(0, queryStart);
            }
            // The part after "/talks/" is passed as the third constructor argument.
            String talkName = modifiedLink.substring(modifiedLink.indexOf("/talks/") + "/talks/".length());
            urls.add(new URL("https://www.ted.com" + modifiedLink + "/transcript.json?language=" + iso1Lang, language, talkName));
        }
    }

    getAndSaveData();
}