Java Code Examples for org.jsoup.Jsoup#parse()

The following examples show how to use org.jsoup.Jsoup#parse(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Nanrentu.java    From PicKing with Apache License 2.0 6 votes vote down vote up
@Override
public String getDetailNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    Document document = Jsoup.parse(new String(result, "gb2312"));
    Elements elements = document.select("div.pagelist a:contains(下一页)");
    if (elements.size() > 0) {
        String tempUrl = elements.get(0).attr("href");
        if (tempUrl == null || "".equals(tempUrl))
            return "";
        Pattern pattern = Pattern.compile("http.*/");
        Matcher matcher = pattern.matcher(currentUrl);
        if (matcher.find()) {
            return matcher.group() + tempUrl;
        }
    }
    return "";
}
 
Example 2
Source File: ElementTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Checks the result size and element order of {@code getElementsByClass}
 * for a class shared by nested elements, a class on a multi-class element,
 * and a class that is absent from the document.
 */
@Test public void testGetElementsWithClass() {
    String html = "<div class='mellow yellow'><span class=mellow>Hello <b class='yellow'>Yellow!</b></span><p>Empty</p></div>";
    Document doc = Jsoup.parse(html);

    // "mellow" is on the outer div and on the span inside it.
    List<Element> mellow = doc.getElementsByClass("mellow");
    assertEquals(2, mellow.size());
    assertEquals("div", mellow.get(0).tagName());
    assertEquals("span", mellow.get(1).tagName());

    // "yellow" is on the outer div and on the nested b.
    List<Element> yellow = doc.getElementsByClass("yellow");
    assertEquals(2, yellow.size());
    assertEquals("div", yellow.get(0).tagName());
    assertEquals("b", yellow.get(1).tagName());

    // A class that does not occur yields an empty list.
    List<Element> missing = doc.getElementsByClass("solo");
    assertEquals(0, missing.size());
}
 
Example 3
Source File: Job51ResumeParser.java    From job with MIT License 6 votes vote down vote up
/**
 * Reads {@code file} as a MIME mail message and parses its HTML part.
 *
 * <p>The message content is cast to {@link Multipart}; its body parts are
 * passed to {@code parseHtml} in order, and the first non-null result is
 * parsed with Jsoup.
 *
 * <p>The input stream is opened in a try-with-resources block so that it is
 * closed even when reading or parsing the message throws.
 *
 * @param file the mail file to read
 * @return the parsed HTML document, or {@code null} when no body part
 *         produced HTML
 * @throws Exception if the file cannot be read or the message cannot be
 *         processed (including a content object that is not a Multipart)
 */
protected Document parse2HtmlAsMail(File file) throws Exception {
  try (InputStream in = new FileInputStream(file)) {
    Session mailSession = Session.getDefaultInstance(System.getProperties(), null);
    MimeMessage msg = new MimeMessage(mailSession, in);

    Multipart part = (Multipart) msg.getContent();
    for (int i = 0; i < part.getCount(); i++) {
      String html = parseHtml(part.getBodyPart(i));
      if (html != null) {
        // First body part that yields HTML wins.
        return Jsoup.parse(html);
      }
    }
    return null;
  }
}
 
Example 4
Source File: ElementTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Checks that {@code appendTo} returns the element it was called on and
 * moves that element under the given parent, including across documents.
 */
@Test
public void testAppendTo() {
    Document parentDoc = Jsoup.parse("<div class='a'></div>");
    Document childDoc = Jsoup.parse("<div class='b'></div><p>Two</p>");

    Element parentBody = parentDoc.body();
    Element childDiv = childDoc.select("div").first();
    Element childParagraph = childDoc.select("p").first();

    // Move the div into the other document's body.
    Element firstResult = childDiv.appendTo(parentBody);
    assertEquals(childDiv, firstResult);

    // Move the paragraph inside the div that was just moved.
    Element secondResult = childParagraph.appendTo(childDiv);
    assertEquals(childParagraph, secondResult);

    String expectedParentHtml = "<div class=\"a\"></div>\n<div class=\"b\">\n <p>Two</p>\n</div>";
    assertEquals(expectedParentHtml, parentDoc.body().html());
    assertEquals("", childDoc.body().html()); // both elements were moved out
}
 
Example 5
Source File: Demo.java    From java-Crawler with MIT License 6 votes vote down vote up
/**
 * Downloads {@code url}, decodes the body as gb2312, and queues the
 * {@code href} of every anchor that does not contain "webPlay", each
 * prefixed with {@code http://www.dy2018.com}.
 *
 * <p>The HTTP client and response are opened in a try-with-resources block
 * so that both are closed once the body has been read, including when the
 * request fails.
 *
 * @param url the page to fetch
 * @return a queue holding the absolute URLs that were collected
 * @throws Exception if the request or reading the body fails
 */
public static Queue getUrlQueue(String url) throws Exception{
    Queue queue = new Queue();

    String index;
    try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
         CloseableHttpResponse closeableHttpResponse = closeableHttpClient.execute(new HttpGet(url))) {
        HttpEntity httpEntity = closeableHttpResponse.getEntity();
        index = EntityUtils.toString(httpEntity, "gb2312");
    }

    Document doc = Jsoup.parse(index);
    for (Element element : doc.select("a")) {
        String aurl = element.attr("href");
        // Links containing "webPlay" are skipped.
        if (!aurl.contains("webPlay")) {
            queue.enQueue("http://www.dy2018.com" + aurl);
        }
    }
    return queue;
}
 
Example 6
Source File: NodeTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks that unwrapping a span replaces it with its children in the parent,
 * and that the returned node is the span's former first child.
 */
@Test public void unwrap() {
    Document doc = Jsoup.parse("<div>One <span>Two <b>Three</b></span> Four</div>");
    Element span = doc.select("span").first();
    Node formerFirstChild = span.childNode(0);
    Node unwrapped = span.unwrap();

    String flattenedBody = TextUtil.stripNewlines(doc.body().html());
    assertEquals("<div>One Two <b>Three</b> Four</div>", flattenedBody);

    assertTrue(unwrapped instanceof TextNode);
    TextNode unwrappedText = (TextNode) unwrapped;
    assertEquals("Two ", unwrappedText.text());
    assertEquals(unwrapped, formerFirstChild);

    // The returned node now hangs directly under the div.
    assertEquals(unwrapped.parent(), doc.select("div").first());
}
 
Example 7
Source File: LyricsChart.java    From QuickLyric with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Builds a {@code Lyrics} object from a response containing a
 * {@code GetLyricResult} element.
 *
 * <p>Artist, title, lyric text and a lookup URL (built from {@code TrackId}
 * and {@code LyricChecksum}) are read from the response. The optional
 * {@code originalMetadata} values fill in an empty artist or title;
 * otherwise they are stored as the original artist or title.
 *
 * <p>Each metadata slot is bounds-checked on its own, so a call that passes
 * only the artist no longer throws {@code ArrayIndexOutOfBoundsException}.
 *
 * <p>NOTE(review): the {@code GetLyricResult} element and its child tags are
 * dereferenced without checks, so a response lacking them still fails with
 * an exception.
 *
 * @param xmlString        the raw response text
 * @param originalMetadata optional; index 0 is the original artist and
 *                         index 1 the original title, a missing entry being
 *                         treated as {@code null}
 * @return an error-flagged {@code Lyrics} when {@code xmlString} is empty,
 *         otherwise the populated result
 */
public static Lyrics fromXml(String xmlString, String... originalMetadata) {
    if (TextUtils.isEmpty(xmlString))
        return new Lyrics(Lyrics.ERROR);
    Document doc = Jsoup.parse(xmlString);
    Element element = doc.getElementsByTag("GetLyricResult").first();
    String id = element.getElementsByTag("TrackId").get(0).text();
    String checksum = element.getElementsByTag("LyricChecksum").get(0).text();
    Lyrics lyrics = new Lyrics(Lyrics.POSITIVE_RESULT);
    lyrics.setArtist(element.getElementsByTag("LyricArtist").get(0).text());
    lyrics.setTitle(element.getElementsByTag("LyricSong").get(0).text());
    lyrics.setURL("http://api.chartlyrics.com/apiv1.asmx/GetLyric?lyricId=" + id + "&lyricCheckSum=" + checksum);

    // Check each index separately: callers may pass the artist alone.
    int metadataCount = originalMetadata == null ? 0 : originalMetadata.length;
    String originalArtist = metadataCount > 0 ? originalMetadata[0] : null;
    String originalTitle = metadataCount > 1 ? originalMetadata[1] : null;

    if (TextUtils.isEmpty(lyrics.getArtist()))
        lyrics.setArtist(originalArtist);
    else
        lyrics.setOriginalArtist(originalArtist);
    if (TextUtils.isEmpty(lyrics.getTitle()))
        lyrics.setTitle(originalTitle);
    else
        lyrics.setOriginalTitle(originalTitle);
    lyrics.setText(element.getElementsByTag("Lyric").get(0).html());
    lyrics.setSource(domain);
    return lyrics;
}
 
Example 8
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks that upper-case Javadoc-style markup parses into an anchor that
 * contains a font element, which in turn contains a b element.
 */
@Test public void handlesJavadocFont() {
    String h = "<TD BGCOLOR=\"#EEEEFF\" CLASS=\"NavBarCell1\">    <A HREF=\"deprecated-list.html\"><FONT CLASS=\"NavBarFont1\"><B>Deprecated</B></FONT></A>&nbsp;</TD>";
    Element anchor = Jsoup.parse(h).select("a").first();

    assertEquals("Deprecated", anchor.text());

    // Walk down the nesting: a > font > b.
    Element font = anchor.child(0);
    assertEquals("font", font.tagName());
    Element bold = font.child(0);
    assertEquals("b", bold.tagName());
}
 
Example 9
Source File: ElementTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks that {@code parents()} lists an element's ancestors from the
 * nearest one outwards: p, div, body, html.
 */
@Test public void testGetParents() {
    Document doc = Jsoup.parse("<div><p>Hello <span>there</span></div>");
    Elements ancestors = doc.select("span").first().parents();

    // Expected ancestor tags, closest first.
    String[] expectedTags = {"p", "div", "body", "html"};
    assertEquals(expectedTags.length, ancestors.size());
    for (int i = 0; i < expectedTags.length; i++) {
        assertEquals(expectedTags[i], ancestors.get(i).tagName());
    }
}
 
Example 10
Source File: PhotoContentPresenter.java    From Toutiao with Apache License 2.0 5 votes vote down vote up
/**
 * Scans the page's script tags for the gallery payload and, when one is
 * found, deserializes it into the {@code bean} field.
 *
 * <p>A script qualifies when its markup contains
 * {@code BASE_DATA.galleryInfo} and its first child node matches the
 * {@code JSON.parse("...")} pattern. The JSON text is taken from between
 * the first "(" and the first ")," of that node, unicode-unescaped, and
 * stripped of backslashes before a lenient Gson parse.
 *
 * @param HTML the page source
 * @return true when at least one script produced a bean, false otherwise
 */
private Boolean parseHTML(String HTML) {
    boolean parsed = false;
    // Collect every script tag in the page
    Elements scriptTags = Jsoup.parse(HTML).getElementsByTag("script");
    for (Element tag : scriptTags) {
        // Skip scripts that do not mention the gallery payload
        if (!tag.toString().contains("BASE_DATA.galleryInfo")) {
            continue;
        }
        // Keep only the content of the script
        String script = tag.childNode(0).toString();

        Matcher matcher = Pattern.compile("(JSON.parse\\(\\\".+\\))").matcher(script);
        // Only the first match is used; the pattern has one capturing group.
        if (matcher.find() && matcher.groupCount() >= 1) {
            int open = script.indexOf("(");
            int close = script.indexOf("),");
            // Drop the opening (" and the closing quote around the payload
            String json = script.substring(open + 2, close - 1);

            // Handle special characters
            json = ChineseUtil.UnicodeToChs(json);
            json = json.replace("\\", "");

            JsonReader reader = new JsonReader(new StringReader(json));
            reader.setLenient(true);
            bean = new Gson().fromJson(reader, PhotoGalleryBean.class);
            Log.d(TAG, "parseHTML: " + bean.toString());
            parsed = true;
        }
    }
    return parsed;
}
 
Example 11
Source File: MultiUploadDotBiz.java    From neembuu-uploader with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Loads the multiupload.biz front page and extracts the upload form data:
 * the form action URL, the upload id, and the value of every checked input
 * under {@code .srvtbl}, which is added to {@code services}.
 *
 * @throws Exception if fetching the page fails
 */
private void initialize() throws Exception {
    responseString = NUHttpClientUtils.getData("http://multiupload.biz/", httpContext);
    doc = Jsoup.parse(responseString);

    uploadURL = doc.select("form#F1").attr("action");
    uploadId = doc.select("input[name=upload_id]").val();

    // Get all the services
    for (Element checkedInput : doc.select(".srvtbl input[checked]")) {
        services.add(checkedInput.val());
    }
}
 
Example 12
Source File: Yande.java    From PicKing with Apache License 2.0 5 votes vote down vote up
/**
 * Finds the URL of the next content page in a downloaded page.
 *
 * <p>The page bytes are decoded as utf-8 and the first
 * {@code div#paginator a.next_page} anchor supplies an href, which is
 * appended to {@code baseUrl}. The page is decoded once and the resulting
 * URL is built once, then reused for both logging and the return value.
 *
 * @param baseUrl    prefix prepended to the anchor's href
 * @param currentUrl not used by this implementation
 * @param result     raw page bytes
 * @return the next page URL, or an empty string when there is no such link
 * @throws UnsupportedEncodingException if utf-8 is not supported
 */
@Override
public String getContentNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    String html = new String(result, "utf-8");
    Document document = Jsoup.parse(html);
    Log.e("Yande", "getSinglePicContent: " + html);

    Elements elements = document.select("div#paginator a.next_page");
    if (elements.size() > 0) {
        String nextUrl = baseUrl + elements.get(0).attr("href");
        Log.e("Yande", "getContentNext: " + nextUrl);
        return nextUrl;
    }
    return "";
}
 
Example 13
Source File: RelinkImagesTask.java    From coolreader with MIT License 4 votes vote down vote up
/**
 * Walks all content pages and re-points thumbnail image URLs that refer to
 * a file that no longer exists, so that they use the current
 * {@code rootPath}.
 *
 * <p>Only {@code img} sources that start with {@code file:///} and contain
 * {@code /project/images/thumb/} are considered. A source is rewritten only
 * when the old file is missing and the file at the new location exists;
 * each rewrite increments {@code updated}. The content is written back
 * after every page, and progress or error messages are published per page.
 *
 * <p>The replacement text is passed through
 * {@code Matcher.quoteReplacement} because {@code replaceAll} treats
 * {@code $} and {@code \} in the replacement as group references and
 * escapes; a {@code rootPath} containing either would otherwise corrupt the
 * URL or throw.
 */
private void processImageInContents() {
	// get all contents
	ArrayList<PageModel> pages = NovelsDao.getInstance().getAllContentPageModel();
	updated = 0;
	int count = 1;
	for (PageModel page : pages) {
		String message = LNReaderApplication.getInstance().getApplicationContext().getResources().getString(R.string.relink_task_progress, page.getPage(), count, pages.size());
		publishProgress(new CallbackEventData(message));

		try {
			// get the contents
			// NOTE(review): the content is loaded with a fresh BookModel rather than
			// anything derived from `page` - confirm this is intended.
			NovelContentModel content = NovelsDao.getInstance().getNovelContent(new BookModel(), false, callback);

			if (content != null) {

				// replace the rootpath based on /project/
				// for now just replace the thumbs
				// file:///mnt/sdcard/test/project/images/thumb/c/c7/Accel_World_v01_262.jpg/84px-Accel_World_v01_262.jpg
				// file:///sdcard-ext/.bakareaderex/project/images/thumb/c/c7/Accel_World_v01_262.jpg/84px-Accel_World_v01_262.jpg

				// Literal replacement text; quoted so '$' and '\' in rootPath are not interpreted.
				String newThumbPrefix = java.util.regex.Matcher.quoteReplacement("file:///" + rootPath + "/project/images/thumb/");

				Document doc = Jsoup.parse(content.getContent());
				Elements imageElements = doc.select("img");
				for (Element image : imageElements) {
					String imgUrl = image.attr("src");
					if (imgUrl.startsWith("file:///") && imgUrl.contains("/project/images/thumb/")) {
						String mntImgUrl = imgUrl.replace("file:///", "");
						Log.d(TAG, "Found image : " + imgUrl);

						if (!new File(mntImgUrl).exists()) {
							Log.d(TAG, "Old image doesn't exists/moved: " + mntImgUrl);
							String newUrl = imgUrl.replaceAll("file:///[\\w/\\./!$%^&*()_+|~\\={}\\[\\]:\";'<>?,-]+/project/images/thumb/", newThumbPrefix);
							String mntNewUrl = newUrl.replace("file:///", "");
							Log.d(TAG, "Trying to replace with " + mntNewUrl);

							// Only relink when the image really exists at the new location.
							if (new File(mntNewUrl).exists()) {
								Log.d(TAG, "Replace image: " + imgUrl + " ==> " + newUrl);
								image.attr("src", newUrl);
								++updated;
							}
						}
					}
				}
				content.setContent(doc.html());
				NovelsDao.getInstance().updateNovelContent(content);

			}
		} catch (Exception e) {
			// A failure on one page is reported and the loop moves on to the next page.
			message = LNReaderApplication.getInstance().getApplicationContext().getResources().getString(R.string.relink_task_error, page.getPage());
			Log.e(TAG, message, e);
			publishProgress(new CallbackEventData(message));
		}
		++count;
	}
}
 
Example 14
Source File: TheVideoAccount.java    From neembuu-uploader with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Logs in to TheVideo.me by posting the stored credentials.
 *
 * <p>The login counts as successful when both the {@code xfsts} and
 * {@code login} cookies are non-empty after the request. Otherwise the text
 * of the first {@code .err} element in the response is used as the error
 * message: "Incorrect Login or Password" raises
 * {@code NUInvalidLoginException}, anything else a generic exception. Both
 * are handled here by resetting the login state and showing the account UI.
 *
 * <p>A response with no {@code .err} element is reported as an unknown login
 * error instead of failing with a {@code NullPointerException}.
 */
@Override
public void login() {
    loginsuccessful = false;
    try {
        initialize();

        NULogger.getLogger().info("Trying to log in to TheVideo.me");
        httpPost = new NUHttpPost("http://www.thevideo.me/");

        List<NameValuePair> formparams = new ArrayList<NameValuePair>();
        formparams.add(new BasicNameValuePair("op", "login"));
        formparams.add(new BasicNameValuePair("login", getUsername()));
        formparams.add(new BasicNameValuePair("password", getPassword()));
        
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, "UTF-8");
        httpPost.setEntity(entity);
        httpResponse = httpclient.execute(httpPost, httpContext);
        NULogger.getLogger().info(httpResponse.getStatusLine().toString());

        if (!CookieUtils.getCookieValue(httpContext, "xfsts").isEmpty() && !CookieUtils.getCookieValue(httpContext, "login").isEmpty()) {
            EntityUtils.consume(httpResponse.getEntity());
            loginsuccessful = true;
            username = getUsername();
            password = getPassword();
            hostsAccountUI().hostUI(HOSTNAME).setEnabled(true);
            NULogger.getLogger().info("TheVideo.me login successful!");

        } else {
            //Get error message
            responseString = EntityUtils.toString(httpResponse.getEntity());
            //FileUtils.saveInFile("TheVideoAccount.html", responseString);
            Document doc = Jsoup.parse(responseString);
            // first() yields null when nothing matches, so check before dereferencing.
            org.jsoup.select.Elements errorElements = doc.select(".err");
            String error = errorElements.isEmpty()
                    ? "unknown (no error element in response)"
                    : errorElements.first().text();
            
            if("Incorrect Login or Password".equals(error)){
                throw new NUInvalidLoginException(getUsername(), HOSTNAME);
            }

            //Generic exception
            throw new Exception("Login error: " + error);
        }
    } catch(NUException ex){
        resetLogin();
        ex.printError();
        accountUIShow().setVisible(true);
    } catch (Exception e) {
        resetLogin();
        NULogger.getLogger().log(Level.SEVERE, "{0}: {1}", new Object[]{getClass().getName(), e});
        showWarningMessage( Translation.T().loginerror(), HOSTNAME);
        accountUIShow().setVisible(true);
    }

}
 
Example 15
Source File: ContentExtractor.java    From WebCollector with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Parses {@code html} with {@code url} as its base URI and returns the text
 * of the element chosen by {@code getContentElementByDoc}.
 *
 * @param html the page source
 * @param url  base URI handed to the parser
 * @return the text of the selected content element
 * @throws Exception propagated from content extraction
 */
public static String getContentByHtml(String html, String url) throws Exception {
    return getContentElementByDoc(Jsoup.parse(html, url)).text();
}
 
Example 16
Source File: SendSpace.java    From neembuu-uploader with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Uploads {@code file} to SendSpace and records the resulting download and
 * delete links.
 *
 * <p>Sequence: pick the HTTP context for a registered or anonymous user,
 * enforce the size limit, call {@code initialize()}, POST the multipart
 * form to {@code uploadURL}, then read the links out of the response page.
 * A {@code NUException} ends in {@code uploadInvalid()}; any other exception
 * is logged and ends in {@code uploadFailed()}.
 */
@Override
public void run() {
    try {
        // Both branches currently set the same limit; only the context and user type differ.
        if (sendSpaceAccount.loginsuccessful) {
            userType = "reg";
            httpContext = sendSpaceAccount.getHttpContext();
            maxFileSizeLimit = 314572800; // 300 MB
        } else {
            userType = "anon";
            cookieStore = new BasicCookieStore();
            httpContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);
            maxFileSizeLimit = 314572800; // 300 MB
        }

        if (file.length() > maxFileSizeLimit) {
            throw new NUMaxFileSizeException(maxFileSizeLimit, file.getName(), host);
        }
        uploadInitialising();
        // NOTE(review): initialize() is presumably what fills uploadURL, progressURL,
        // signature and userID used below - it is not visible here, confirm.
        initialize();
        
        // https://fs08u.sendspace.com/upload?SPEED_LIMIT=0&MAX_FILE_SIZE=314572800&UPLOAD_IDENTIFIER=x.x.x.x.0&DESTINATION_DIR=xx
        // fs08u.sendspace.com/upload?SPEED_LIMIT=0&MAX_FILE_SIZE=314572800&UPLOAD_IDENTIFIER=x.x.x.x.0&DESTINATION_DIR=xx
        hostName = StringUtils.stringStartingFromString(uploadURL, "https://");
        // fs08u.sendspace.com
        hostName = StringUtils.stringUntilString(hostName, "sendspace.com") + "sendspace.com";
        
        // https://fs08u.sendspace.com/upload?SPEED_LIMIT=0&MAX_FILE_SIZE=314572800&UPLOAD_IDENTIFIER=910609187.1440099567.3BB289C9.22.0&DESTINATION_DIR=22
        httpPost = new NUHttpPost(uploadURL);
        httpPost.setHeader("Host", hostName);
        httpPost.setHeader("Referer", "https://www.sendspace.com/");
        
        // Build the multipart upload form.
        MultipartEntity mpEntity = new MultipartEntity(HttpMultipartMode.BROWSER_COMPATIBLE);
        mpEntity.addPart("PROGRESS_URL", new StringBody(progressURL));
        mpEntity.addPart("js_enabled", new StringBody("1"));
        mpEntity.addPart("signature", new StringBody(signature));
        mpEntity.addPart("upload_files", new StringBody(""));
        // Registered users additionally send their user id and a target folder.
        if (userType.equals("reg")) {
            mpEntity.addPart("userid", new StringBody(userID));
            mpEntity.addPart("folder_id", new StringBody("0"));
        }
        mpEntity.addPart("terms", new StringBody("1"));
        mpEntity.addPart("file[]", new StringBody(""));
        mpEntity.addPart("description[]", new StringBody(""));
        mpEntity.addPart("upload_file[]", createMonitoredFileBody());
        httpPost.setEntity(mpEntity);
        
        NULogger.getLogger().log(Level.INFO, "executing request {0}", httpPost.getRequestLine());
        NULogger.getLogger().info("Now uploading your file into SendSpace.com");
        uploading();
        httpResponse = httpclient.execute(httpPost, httpContext);
        responseString = EntityUtils.toString(httpResponse.getEntity());
        
        doc = Jsoup.parse(responseString);
        
        //Read the links
        gettingLink();
        // NOTE(review): first() is dereferenced unchecked; a response page without the
        // expected link would presumably end up in the generic handler below - confirm.
        downloadlink = doc.select("div[class=file_description]").select("a").first().attr("href");
        deletelink = doc.select("a[class=link]").attr("href");

        NULogger.getLogger().log(Level.INFO, "Delete link : {0}", deletelink);
        NULogger.getLogger().log(Level.INFO, "Download link : {0}", downloadlink);
        downURL = downloadlink;
        delURL = deletelink;

        uploadFinished();
    } catch(NUException ex){
        // Project-specific failures (e.g. the size-limit exception thrown above).
        ex.printError();
        uploadInvalid();
    } catch (Exception e) {
        // Anything else is logged and reported as a failed upload.
        Logger.getLogger(getClass().getName()).log(Level.SEVERE, null, e);
        uploadFailed();
    }
}
 
Example 17
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Checks the parsed output for attribute names that contain a control
 * character: two of the attributes are dropped entirely and the third is
 * kept under the name {@code foo}.
 */
@Test public void handlesControlCodeInAttributeName() {
    String html = "<p><a \06=foo>One</a><a/\06=bar><a foo\06=bar>Two</a></p>";
    String expectedBody = "<p><a>One</a><a></a><a foo=\"bar\">Two</a></p>";

    Document doc = Jsoup.parse(html);
    assertEquals(expectedBody, doc.body().html());
}
 
Example 18
Source File: UpLeaAccount.java    From neembuu-uploader with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Logs in to UpLea.com by requesting the account's API key.
 *
 * <p>The login counts as successful when the response body contains "true"
 * and a non-empty {@code api_key} can be extracted from it. Otherwise the
 * text of the first {@code .err} element is used as the error message:
 * "Incorrect Login or Password" raises {@code NUInvalidLoginException},
 * anything else a generic exception. Both are handled here by resetting the
 * login state and showing the account UI.
 *
 * <p>The response body is read once and reused in the failure branch; the
 * entity has already been read at that point, so it is not read again. A
 * response with no {@code .err} element is reported as an unknown login
 * error instead of failing with a {@code NullPointerException}.
 */
@Override
public void login() {
    loginsuccessful = false;
    try {
        initialize();

        NULogger.getLogger().info("Trying to log in to UpLea.com");
        httpPost = new NUHttpPost("http://api.uplea.com/api/get-my-api-key");

        List<NameValuePair> formparams = new ArrayList<NameValuePair>();
        formparams.add(new BasicNameValuePair("username", getUsername()));
        formparams.add(new BasicNameValuePair("password", getPassword()));
        
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, "UTF-8");
        httpPost.setEntity(entity);
        httpResponse = httpclient.execute(httpPost, httpContext);
        NULogger.getLogger().info(httpResponse.getStatusLine().toString());
        responseString = EntityUtils.toString(httpResponse.getEntity());
        // {"error":[],"result":{"api_key":"alphanumericstring"},"status":true}
        api_key = StringUtils.stringBetweenTwoStrings(responseString, "\"api_key\":\"", "\"");

        if (responseString.contains("true") && !api_key.isEmpty()) {
            EntityUtils.consume(httpResponse.getEntity());
            loginsuccessful = true;
            username = getUsername();
            password = getPassword();
            NULogger.getLogger().info("UpLea.com login successful!");

        } else {
            //Get error message
            // Reuse the body read above; the entity has already been read.
            Document doc = Jsoup.parse(responseString);
            // first() yields null when nothing matches, so check before dereferencing.
            org.jsoup.select.Elements errorElements = doc.select(".err");
            String error = errorElements.isEmpty()
                    ? "unknown (no error element in response)"
                    : errorElements.first().text();
            
            if("Incorrect Login or Password".equals(error)){
                throw new NUInvalidLoginException(getUsername(), HOSTNAME);
            }

            //Generic exception
            throw new Exception("Login error: " + error);
        }
    } catch(NUException ex){
        resetLogin();
        ex.printError();
        accountUIShow().setVisible(true);
    } catch (Exception e) {
        resetLogin();
        NULogger.getLogger().log(Level.SEVERE, "{0}: {1}", new Object[]{getClass().getName(), e});
        showWarningMessage( Translation.T().loginerror(), HOSTNAME);
        accountUIShow().setVisible(true);
    }
}
 
Example 19
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Checks that a table with missing row tags and a nested table is
 * normalized into the expected tbody/tr/td structure for both tables.
 */
@Test public void handlesNestedImplicitTable() {
    String html = "<table><td>1</td></tr> <td>2</td></tr> <td> <table><td>3</td> <td>4</td></table> <tr><td>5</table>";
    String expectedBody = "<table><tbody><tr><td>1</td></tr> <tr><td>2</td></tr> <tr><td> <table><tbody><tr><td>3</td> <td>4</td></tr></tbody></table> </td></tr><tr><td>5</td></tr></tbody></table>";

    Document doc = Jsoup.parse(html);
    String actualBody = TextUtil.stripNewlines(doc.body().html());
    assertEquals(expectedBody, actualBody);
}
 
Example 20
Source File: PolymerServerEventHandlersTest.java    From flow with Apache License 2.0 4 votes vote down vote up
/**
 * Passes the superclass a parser callback that ignores its arguments and
 * always returns template data with an empty first argument and a parsed,
 * empty {@code dom-module} with id {@code polymer}.
 */
CorrectAnnotationUsage() {
    super((clazz, tag, service) -> {
        // The three callback arguments are intentionally unused.
        return new TemplateData("", Jsoup.parse("<dom-module id='polymer'></dom-module>"));
    });
}