Java Code Examples for org.jsoup.nodes.Document

The following are the top-voted examples showing how to use org.jsoup.nodes.Document. These examples are extracted from open-source projects. You can vote up the examples you like, and your votes will be used in our system to generate more good examples.
Example 1
Project: WebtoonDownloadManager   File: ManualController.java   View source code 20 votes vote down vote up
/**
 * Looks up a webtoon by its code and shows its details in the UI.
 *
 * <p>For a non-empty code the page is fetched through a {@code CommonService}
 * connection (timeout value 5000) and the title, author, description and
 * thumbnail controls are filled from it. Any failure while fetching or parsing
 * is only printed to standard error. For an empty code an informational alert
 * is scheduled through {@code Platform.runLater}.
 *
 * @param code the webtoon code to look up
 */
public void getWebtoon(String code) {

	if ("".equals(code)) {
		// Nothing to look up: ask the user to enter a webtoon code.
		Platform.runLater(new Runnable() {
			@Override
			public void run() {
				new AlertSupport("웹툰코드를 입력하세요.").alertInfoMsg(stage);
			}
		});
		return;
	}

	Connection conn = new CommonService().getConnection(code);
	conn.timeout(5000);

	codeInputField.setText(code);
	wDesc.setWrapText(true);

	try {
		Document page = conn.get();

		// Keep only the part of the <title> text that precedes the first "::".
		String title = page.select("title").text().split("::")[0];
		setTitle(title);

		String author = page.select("div.detail h2 > span").text();
		wTitle.setText(title + "(" + author + ")");

		wDesc.setText(page.select("div.detail p").text());

		String thumbnailUrl = page.select("div.thumb > a img").attr("src");
		thumbnail.setImage(new Image(thumbnailUrl, true));

	} catch (Exception e) {
		e.printStackTrace();
	}
}
 
Example 2
Project: android-apps   File: JsoupUtil.java   View source code 10 votes vote down vote up
/**
 * Downloads the catalog page at {@code url} and stores its chapter list in {@code map}.
 *
 * <p>Every element with class {@code list-group-item} that contains an {@code <a>}
 * contributes one entry with the keys {@code "catalog"} (element text) and
 * {@code "href"} (link target prefixed with {@code http://www.meiyuxs.com}).
 * The map receives {@code "data"} (the entry list), {@code "cover"} (always empty)
 * and {@code "lastChapter"} (catalog text of the last entry, or an empty string
 * when the page yields no entries).
 *
 * <p>If the download fails the stack trace is printed and {@code map} is
 * returned without these keys being set.
 *
 * @param map the map to fill; returned to the caller
 * @param url the catalog page URL
 * @return the same {@code map} instance
 */
private static Map meiyuxsCatalog(Map map, String url) {
  try {
    List<Map<String, Object>> data = new ArrayList<>();
    Document document = Jsoup
        .connect(url)
        .userAgent(FormatUtil.USER_AGENT_PC)
        .get();
    for (Element catalogE : document.body().getElementsByClass("list-group-item")) {
      // first() returns null when the item holds no link; such items are skipped.
      Element link = catalogE.getElementsByTag("a").first();
      if (link == null) {
        continue;
      }
      Map<String, Object> entry = new HashMap<>();
      entry.put("catalog", catalogE.text());
      entry.put("href", "http://www.meiyuxs.com" + link.attr("href"));
      data.add(entry);
    }
    map.put("data", data);
    map.put("cover", "");
    // Guard against an empty catalog: indexing size() - 1 would throw otherwise.
    String lastChapter = data.isEmpty()
        ? ""
        : data.get(data.size() - 1).get("catalog").toString();
    map.put("lastChapter", lastChapter);
  } catch (IOException e) {
    e.printStackTrace();
  }
  return map;
}
 
Example 3
Project: PicKing   File: AKabe.java   View source code 10 votes vote down vote up
/**
 * Parses a detail page and collects one {@code PicInfo} per gallery item.
 *
 * <p>The title is the text of the first {@code #header h1} match (empty if none),
 * the tags are the texts of all {@code ul.tagList a} matches, and every
 * {@code ul.gallery li:has(img)} element contributes its {@code data-src} attribute.
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl stored in the result map under {@code CURRENT_URL}
 * @param result     the raw page bytes, decoded as UTF-8
 * @param resultMap  the map to fill; returned to the caller
 * @return {@code resultMap} with {@code CURRENT_URL} and {@code RESULT} set
 * @throws UnsupportedEncodingException declared by the byte-to-string conversion
 */
@Override
public Map<DetailActivity.parameter, Object> getDetailContent(String baseUrl, String currentUrl, byte[] result, Map<DetailActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    Document page = Jsoup.parse(new String(result, "utf-8"));

    Elements headings = page.select("#header h1");
    String sTitle = headings.isEmpty() ? "" : headings.get(0).text();

    List<String> tagList = new ArrayList<>();
    for (Element tag : page.select("ul.tagList a")) {
        tagList.add(tag.text());
    }

    // Every picture shares the same title and the same tag list instance.
    List<PicInfo> urls = new ArrayList<>();
    for (Element item : page.select("ul.gallery li:has(img)")) {
        urls.add(new PicInfo(item.attr("data-src")).setTitle(sTitle).setTags(tagList));
    }

    resultMap.put(DetailActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(DetailActivity.parameter.RESULT, urls);
    return resultMap;
}
 
Example 4
Project: desktop   File: Main.java   View source code 9 votes vote down vote up
/**
 * Requests an insult for the language selected in {@code cmbLanguage} and
 * displays the text of the response body in {@code txtPaneShow}.
 *
 * <p>Runtime exceptions are rethrown unchanged; any other exception replaces
 * the pane content with an outage message.
 */
public void showWord() {
    try {
        Languages selected = (Languages) cmbLanguage.getSelectedItem();
        String requestUrl = "http://evilinsult.com/generate_insult.php?lang=" + selected.getLang();
        Document response = Jsoup.connect(requestUrl).get();
        for (Element body : response.select("body")) {
            txtPaneShow.setText("\n" + body.text());
        }
    } catch (RuntimeException e) {
        // Let unchecked failures propagate instead of reporting them as an outage.
        throw e;
    } catch (Exception ex) {
        txtPaneShow.setText("\n" + "Insult Outage! Please Check Your Internet Connection And Try Again In Three Minutes");
    }
}
 
Example 5
Project: PartyBuildingStudies   File: NewsPresenter.java   View source code 8 votes vote down vote up
/**
 * Downloads {@code url} on the IO scheduler, extracts one {@code ArticleItem}
 * per {@code <li>} inside elements of class {@code list_line}, and hands the
 * list to the view on the Android main thread.
 *
 * <p>The loading indicator is shown before the request and hidden again when
 * the list is delivered or when the request fails. A failure is printed to
 * standard error rather than left without a handler.
 *
 * @param url the page to download and parse
 */
@Override
public void onHandleParseHTML(final String url) {
    mView.showLoading(true);
    Observable.create(new ObservableOnSubscribe<ArrayList<ArticleItem>>() {
        @Override
        public void subscribe(ObservableEmitter<ArrayList<ArticleItem>> e) throws Exception {
            ArrayList<ArticleItem> list = new ArrayList<>();
            Document doc = Jsoup.connect(url).get();
            for (Element u : doc.getElementsByClass("list_line")) {
                for (Element l : u.getElementsByTag("li")) {
                    Elements anchors = l.getElementsByTag("a");
                    String text = anchors.text();
                    String href = anchors.attr("href");
                    String time = l.getElementsByTag("span").text();
                    list.add(new ArticleItem(text, href, time));
                }
            }
            e.onNext(list);
            // Exactly one list is emitted, so terminate the stream explicitly.
            e.onComplete();
        }
    })
            .subscribeOn(Schedulers.io())
            .observeOn(AndroidSchedulers.mainThread())
            .subscribe(new Consumer<ArrayList<ArticleItem>>() {
                @Override
                public void accept(@NonNull ArrayList<ArticleItem> articleItems) throws Exception {
                    mView.showList(articleItems);
                    mView.showLoading(false);
                }
            }, new Consumer<Throwable>() {
                @Override
                public void accept(@NonNull Throwable throwable) throws Exception {
                    // Without an error consumer a failed download is reported as an
                    // unhandled error and the loading indicator stays visible forever.
                    throwable.printStackTrace();
                    mView.showLoading(false);
                }
            });
}
 
Example 6
Project: NewKakaoBot   File: ScriptUtil.java   View source code 6 votes vote down vote up
/**
 * Fetches {@code url} on a new thread, selects {@code option} from the page and
 * invokes {@code func} with {@code [text, null]} on success or {@code [null, exception]}
 * when the fetch fails with an {@code IOException}.
 *
 * @param url    the page to download
 * @param option the selector passed to {@code Document.select}
 * @param func   the callback to invoke with the outcome
 * @throws IOException declared by the signature; the visible body does not throw it
 */
@JSStaticFunction
public static void parseToText(final String url, final String option, final Function func) throws IOException {
    Runnable task = new Runnable() {
        @Override
        public void run() {
            try {
                Elements selected = Jsoup.connect(url).get().select(option);

                func.call(context, scope, scope, new Object[] { selected.text(), null });
            } catch (IOException e) {
                try {
                    func.call(context, scope, scope, new Object[] { null, e });
                } catch (Exception ignored) {
                    // A failure inside the error callback is deliberately dropped.
                }
            }
        }
    };
    new Thread(task).start();
}
 
Example 7
Project: ripme   File: XhamsterRipper.java   View source code 6 votes vote down vote up
/**
 * Collects the image URLs of all thumbnails on a gallery page, rewritten from
 * thumbnail form to the full-size image form.
 *
 * @param doc the gallery page
 * @return one rewritten URL per matched thumbnail, in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> result = new ArrayList<>();
    String thumbSelector = "div.picture_view > div.pictures_block > div.items > div.item-container > a > div.thumb_container > div.img > img";
    for (Element thumb : doc.select(thumbSelector)) {
        String fullSize = thumb.attr("src")
                // replace thumbnail urls with the urls to the full sized images
                .replaceAll("https://upt.xhcdn\\.", "http://up.xhamster.")
                .replaceAll("ept\\.xhcdn", "ep.xhamster")
                .replaceAll("_160\\.", "_1000.")
                // Xhamster has bad cert management and uses invalid certs for some cdns,
                // so all requests are switched to http
                .replaceAll("https", "http");
        result.add(fullSize);
    }
    return result;
}
 
Example 8
Project: crawler-jsoup-maven   File: JsoupUtil.java   View source code 6 votes vote down vote up
/**
 * Fetches a document, optionally sending a single cookie with the request.
 *
 * <p>When both key and value are empty the request is delegated to
 * {@code getDocument(url)}; when both are present they are wrapped in a
 * one-entry map and passed to {@code getDocumentWithCookies}.
 *
 * @param url           the URL to visit
 * @param cookieKey     the key of the cookie to bind
 * @param cookieValue   the value of the cookie to bind
 * @return Document     the fetched document
 * @throws IllegalArgumentException if exactly one of key and value is empty
 * @throws Exception    propagated from the delegated fetch
 */
public static Document getDocumentWithCookie(String url, String cookieKey, String cookieValue) throws Exception {

    boolean keyMissing = StringUtil.isEmpty(cookieKey);
    boolean valueMissing = StringUtil.isEmpty(cookieValue);

    if (keyMissing != valueMissing) {
        // Invalid arguments: key and value must be supplied together.
        throw new IllegalArgumentException("key or value is err"); // TODO hard coding is fixing bluetata 2017/03/20 add
    }
    if (keyMissing) {
        return getDocument(url);
    }

    Map<String, String> cookiesMap = new HashMap<String, String>();
    cookiesMap.put(cookieKey, cookieValue);
    return getDocumentWithCookies(url, cookiesMap);
}
 
Example 9
Project: crawling-framework   File: DateParser.java   View source code 6 votes vote down vote up
/**
 * Collects date candidates from the elements matched by each selector in
 * {@code ITEMPROP_SELECTORS}.
 *
 * <p>For every matched element the first non-empty attribute among
 * {@code datetime}, {@code content} and {@code title} (in that order) is
 * recorded together with the selector that matched it.
 *
 * @param document the document to inspect
 * @return the recorded matches, possibly empty
 */
public static List<MatchedDate> extractFromProperties(Document document) {
    List<MatchedDate> result = Lists.newArrayList();

    for (String selector : ITEMPROP_SELECTORS) {
        document.select(selector).forEach(m -> {
            // Fall through the attributes in priority order until one has a value.
            String value = m.attr("datetime");
            if (Strings.isNullOrEmpty(value)) {
                value = m.attr("content");
            }
            if (Strings.isNullOrEmpty(value)) {
                value = m.attr("title");
            }
            if (!Strings.isNullOrEmpty(value)) {
                result.add(new MatchedDate(value, selector));
            }
        });
    }

    return result;
}
 
Example 10
Project: PicKing   File: XiuMM.java   View source code 6 votes vote down vote up
/**
 * Parses an album listing page into {@code AlbumInfo} entries.
 *
 * <p>Each {@code div.album} element yields one entry: the title from the first
 * {@code span.name}, the album URL from the {@code .pic_box a} links and the
 * picture URL from the first {@code img} below those links.
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl stored in the result map under {@code CURRENT_URL}
 * @param result     the raw page bytes, decoded as UTF-8
 * @param resultMap  the map to fill; returned to the caller
 * @return {@code resultMap} with {@code CURRENT_URL} and {@code RESULT} set
 * @throws UnsupportedEncodingException declared by the byte-to-string conversion
 */
@Override
public Map<ContentsActivity.parameter, Object> getContent(String baseUrl, String currentUrl, byte[] result, Map<ContentsActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    Document page = Jsoup.parse(new String(result, "utf-8"));

    List<AlbumInfo> urls = new ArrayList<>();
    for (Element albumElement : page.select("div.album")) {
        AlbumInfo info = new AlbumInfo();

        Elements names = albumElement.select("span.name");
        if (!names.isEmpty()) {
            info.setTitle(names.get(0).text());
        }

        Elements links = albumElement.select(".pic_box a");
        info.setAlbumUrl(links.attr("href"));

        Elements images = links.select("img");
        if (!images.isEmpty()) {
            info.setPicUrl(images.get(0).attr("src"));
        }
        urls.add(info);
    }

    resultMap.put(ContentsActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(ContentsActivity.parameter.RESULT, urls);
    return resultMap;
}
 
Example 11
Project: Linda-AI   File: Act.java   View source code 6 votes vote down vote up
/**
 * Looks up the term enclosed in single quotes inside {@code dico} on the
 * Italian Wikipedia and reports the outcome through {@code GUI.giveResponse}.
 *
 * <p>The term is the text between the first and the last {@code '} of the
 * input. If the input does not contain a quoted term, the user is reminded to
 * quote it. The text of all {@code <p>} elements of the article is reported
 * when non-empty; an HTTP status error is reported as a missing article; other
 * I/O errors are printed to standard error.
 *
 * @param dico the user's sentence containing the quoted search term
 */
static void Wikipedia(String dico) {
    String cercowikipedia;
    try {
        // The extraction must be inside a try block: substring throws when the
        // quotes are missing (or there is only one), and that is the case the
        // reminder message below is meant for.
        cercowikipedia = dico.substring(dico.indexOf("'") + 1, dico.lastIndexOf("'"));
    } catch (StringIndexOutOfBoundsException g) {
        new GUI().giveResponse("Ricorda che, perché io cerchi informazioni riguardo a qualcosa, occorre che tu la definisca fra due virgolette!");
        return;
    }

    try {
        Document significatowikipedia = Jsoup.connect("https://it.wikipedia.org/wiki/" + cercowikipedia.replace(" ", "_")).userAgent("Mozilla").get();
        String divs = significatowikipedia.select("p").text();
        if (!divs.equals("")) {
            new GUI().giveResponse("La ricerca di " + cercowikipedia + " su wikipedia ha restituito il seguente risultato:" + '\n' + divs);
        } else {
            new GUI().giveResponse("Mi dispiace, non ho trovato informazioni su " + cercowikipedia + " su Wikipedia...");
        }
    } catch (HttpStatusException e) {
        new GUI().giveResponse("Mi dispiace, Wikipedia sembra non avere una voce per '" + cercowikipedia +"'...");
    } catch (java.io.IOException f) {
        f.printStackTrace();
    }
}
 
Example 12
Project: catpeds   File: PawpedsDocumentParserTest.java   View source code 6 votes vote down vote up
/**
 * Verifies that {@link PawpedsDocumentParser#parseSearch(Document)} throws an
 * {@link IllegalArgumentException} when a jsoup selector call fails with a
 * {@code SelectorParseException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testJsoupSelectorUnexpectedError() throws Exception {
	// Given: a document whose error header is empty ...
	Document brokenDocument = mock(Document.class);

	Elements emptyErrorHeader = mock(Elements.class);
	when(emptyErrorHeader.text()).thenReturn("");
	when(brokenDocument.select("th.error")).thenReturn(emptyErrorHeader);

	// ... but whose result-row selector fails
	when(brokenDocument.select("table.searchresult tr.searchresult:has(td.searchresult)")).thenThrow(SelectorParseException.class);

	// When
	pawpedsDocumentParser.parseSearch(brokenDocument);

	// Then
	// the expected exception is declared on the @Test annotation
}
 
Example 13
Project: ConnectU   File: AnunciosRequest.java   View source code 6 votes vote down vote up
/**
 * Requests {@code ANUNCIOS_URL} and parses every child of the first element
 * matching {@code ANUNCIOS_LIST_BODY} as an announcement.
 *
 * <p>The callback receives {@code (true, "")} after successful parsing,
 * {@code (false, body)} when the request itself failed, and
 * {@code (false, ErrorManager.LOGIN_REJECTED)} when a
 * {@code NullPointerException} occurs while parsing (also logged and reported
 * to {@code FirebaseCrash}).
 *
 * @param callback receives the outcome of the load
 */
public void loadAnuncios(final AnunciosCallback callback) {
    UAWebService.HttpWebGetRequest(context, ANUNCIOS_URL, new UAWebService.WebCallBack() {
        @Override
        public void onNavigationComplete(boolean isSuccessful, String body) {
            if (!isSuccessful) {
                callback.onResult(false, body);
                return;
            }

            // Get Post data
            Element anuncios = Jsoup.parse(body).select(ANUNCIOS_LIST_BODY).first();
            try {
                for (Element anuncio : anuncios.children()) {
                    parseAnuncio(anuncio, "");
                }
                callback.onResult(true, "");
            } catch (NullPointerException e) {
                FirebaseCrash.log(body);
                FirebaseCrash.report(e);
                // Usually because session ended!
                callback.onResult(false, ErrorManager.LOGIN_REJECTED);
            }
        }
    });
}
 
Example 14
Project: JsoupSample   File: ImageListPresenterImpl.java   View source code 6 votes vote down vote up
/**
 * Extracts the image list from {@code document} using the manager that matches
 * the current view type.
 *
 * @param document the page to extract images from
 * @return the manager's image list, or a new empty list when there is no view
 *         or the view type is not one of the handled types
 */
@Override
public List<ImageModel> getT(Document document) {
    final List<ImageModel> images;
    if (view == null) {
        images = new ArrayList<>();
    } else {
        switch (view.getType()) {
            case ApiConfig.Type.DOU_BAN_MEI_ZI:
                images = JsoupDoubanManager.get(document).getImageList();
                break;
            case ApiConfig.Type.KK:
                images = JsoupKKManager.get(document).getImageList();
                break;
            case ApiConfig.Type.M_ZI_TU:
                images = JsoupMZiTuManager.get(document).getImageList();
                break;
            case ApiConfig.Type.MM:
                images = JsoupMMManager.get(document).getImageList();
                break;
            case ApiConfig.Type.MEIZITU:
                images = JsoupMeiZiTuManager.get(document).getImageList();
                break;
            default:
                images = new ArrayList<>();
                break;
        }
    }
    return images;
}
 
Example 15
Project: ripme   File: DeviantartRipper.java   View source code 6 votes vote down vote up
/**
 * Collects the {@code href} attribute of every thumbnail on the page whose
 * first {@code img} does not carry {@code transparent="false"}.
 *
 * <p>Each thumbnail's {@code href} is logged, and iteration ends early once
 * {@code isStopped()} reports true.
 *
 * @param page the page to scan
 * @return the collected {@code href} values in document order
 */
@Override
public List<String> getDescriptionsFromPage(Document page) {
    List<String> textURLs = new ArrayList<>();
    // Iterate over all thumbnails
    for (Element thumb : page.select("div.zones-container span.thumb")) {
        String href = thumb.attr("href");
        logger.info(href);
        if (isStopped()) {
            break;
        }
        Element img = thumb.select("img").get(0);
        // a.thumbs to other albums are invisible, so only the remaining ones are kept
        if (!img.attr("transparent").equals("false")) {
            textURLs.add(href);
        }
    }
    return textURLs;
}
 
Example 16
Project: ripme   File: FlickrRipper.java   View source code 6 votes vote down vote up
/**
 * Logs in to Flickr.
 *
 * <p>Loads the sign-in page, copies all hidden form inputs into the POST data,
 * adds the login fields (credentials are Base64-decoded from embedded
 * constants) and submits them to the action of the first POST form, reusing
 * the cookies of the first response.
 *
 * @return Cookies for logged-in session
 * @throws IOException if either request fails
 */
@SuppressWarnings("unused")
private Map<String,String> signinToFlickr() throws IOException {
    Response signinPage = Jsoup.connect("http://www.flickr.com/signin/")
                        .userAgent(USER_AGENT)
                        .followRedirects(true)
                        .method(Method.GET)
                        .execute();
    Document doc = signinPage.parse();

    Map<String,String> postData = new HashMap<>();
    for (Element hidden : doc.select("input[type=hidden]")) {
        postData.put(hidden.attr("name"), hidden.attr("value"));
    }
    postData.put("passwd_raw",  "");
    postData.put(".save",   "");
    postData.put("login",   new String(Base64.decode("bGVmYWtlZGVmYWtl")));
    postData.put("passwd",  new String(Base64.decode("MUZha2V5ZmFrZQ==")));

    String action = doc.select("form[method=post]").get(0).attr("action");
    return Jsoup.connect(action)
                .cookies(signinPage.cookies())
                .data(postData)
                .method(Method.POST)
                .execute()
                .cookies();
}
 
Example 17
Project: PicKing   File: Mntu92.java   View source code 6 votes vote down vote up
/**
 * Parses a detail page into a single {@code PicInfo}.
 *
 * <p>The picture URL is {@code baseUrl} plus the {@code src} of the last
 * {@code #bigpic img} match, the title is the combined text of the
 * {@code #entry h1} matches, and the tags are the texts of the
 * {@code .postinfo a} matches. Title and tags are only set when matches exist.
 *
 * @param baseUrl    prefix for the picture URL
 * @param currentUrl stored in the result map under {@code CURRENT_URL}
 * @param result     the raw page bytes, decoded as UTF-8
 * @param resultMap  the map to fill; returned to the caller
 * @return {@code resultMap} with {@code CURRENT_URL} and {@code RESULT} set
 * @throws UnsupportedEncodingException declared by the byte-to-string conversion
 */
@Override
public Map<DetailActivity.parameter, Object> getDetailContent(String baseUrl, String currentUrl, byte[] result, Map<DetailActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    Document page = Jsoup.parse(new String(result, "utf-8"));
    PicInfo picInfo = new PicInfo();

    // Each match overwrites the previous one, so the last image wins.
    for (Element image : page.select("#bigpic img")) {
        picInfo.setPicUrl(baseUrl + image.attr("src"));
    }

    Elements headings = page.select("#entry h1");
    if (!headings.isEmpty()) {
        picInfo.setTitle(headings.text());
    }

    Elements tagLinks = page.select(".postinfo a");
    if (!tagLinks.isEmpty()) {
        List<String> tagList = new ArrayList<>();
        for (Element tagLink : tagLinks) {
            tagList.add(tagLink.text());
        }
        picInfo.setTags(tagList);
    }

    List<PicInfo> urls = new ArrayList<>();
    urls.add(picInfo);

    resultMap.put(DetailActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(DetailActivity.parameter.RESULT, urls);
    return resultMap;
}
 
Example 18
Project: android-apps   File: JsoupUtil.java   View source code 6 votes vote down vote up
/**
 * Downloads the catalog page at {@code url} and stores its chapter list and
 * cover image in {@code map}.
 *
 * <p>Every {@code <dd>} below the element with id {@code list} that contains
 * an {@code <a>} contributes one entry with the keys {@code "catalog"}
 * (element text) and {@code "href"} ({@code url} plus the link target). The
 * map receives {@code "data"}, {@code "cover"} (the {@code src} of the first
 * image below id {@code fmimg}) and {@code "lastChapter"}. A missing list,
 * missing cover or empty catalog yields an empty list or empty strings instead
 * of an exception.
 *
 * <p>If the download fails the stack trace is printed and {@code map} is
 * returned without these keys being set.
 *
 * @param map the map to fill; returned to the caller
 * @param url the catalog page URL, also used as prefix for chapter links
 * @return the same {@code map} instance
 */
private static Map qulaCatalog(Map map, String url) {
  try {
    List<Map<String, Object>> data = new ArrayList<>();
    Document document = Jsoup
        .connect(url)
        .userAgent(FormatUtil.USER_AGENT_PC)
        .get();
    Element body = document.body();

    // getElementById returns null when the id is absent from the page.
    Element catalogList = body.getElementById("list");
    if (catalogList != null) {
      for (Element catalogE : catalogList.getElementsByTag("dd")) {
        Element link = catalogE.getElementsByTag("a").first();
        if (link == null) {
          continue;
        }
        Map<String, Object> entry = new HashMap<>();
        entry.put("catalog", catalogE.text());
        entry.put("href", url + link.attr("href"));
        data.add(entry);
      }
    }

    Element coverHolder = body.getElementById("fmimg");
    Element coverImage = coverHolder == null ? null : coverHolder.getElementsByTag("img").first();
    String cover = coverImage == null ? "" : coverImage.attr("src");

    map.put("data", data);
    map.put("cover", cover);
    // Guard against an empty catalog: indexing size() - 1 would throw otherwise.
    String lastChapter = data.isEmpty()
        ? ""
        : data.get(data.size() - 1).get("catalog").toString();
    map.put("lastChapter", lastChapter);
  } catch (IOException e) {
    e.printStackTrace();
  }
  return map;
}
 
Example 19
Project: wulkanowy   File: StudentAndParentTestCase.java   View source code 6 votes vote down vote up
/**
 * Builds a {@code StudentAndParent} mock backed by an HTML fixture.
 *
 * <p>{@code getSnPPageDocument} returns the parsed fixture for any string
 * argument, while {@code getSemesters}, {@code getCurrentSemester} and
 * {@code getRowDataChildValue} call the real implementations.
 *
 * @param fixtureFileName resource name of the fixture, resolved against this class
 * @return the configured mock
 * @throws Exception if the fixture cannot be read
 */
protected StudentAndParent getSnp(String fixtureFileName) throws Exception {
    String fixtureHtml = FixtureHelper.getAsString(getClass().getResourceAsStream(fixtureFileName));
    Document fixturePage = Jsoup.parse(fixtureHtml);

    StudentAndParent snp = Mockito.mock(StudentAndParent.class);

    // Page loading is replaced by the fixture ...
    Mockito.when(snp.getSnPPageDocument(Mockito.anyString())).thenReturn(fixturePage);

    // ... while the parsing helpers keep their real behavior.
    Mockito.when(snp.getSemesters(Mockito.any(Document.class))).thenCallRealMethod();
    Mockito.when(snp.getCurrentSemester(Mockito.<Semester>anyList())).thenCallRealMethod();
    Mockito.when(snp.getRowDataChildValue(Mockito.any(Element.class), Mockito.anyInt()))
            .thenCallRealMethod();

    return snp;
}
 
Example 20
Project: fantalegheEV   File: RemoteController.java   View source code 6 votes vote down vote up
/**
 * Downloads the calendar page of a league and collects the team results per
 * match day.
 *
 * <p>Every {@code .table} element is one day, numbered from 1 in document
 * order; processing stops at the first table that has a {@code .greyfoot}
 * child. Each {@code .match} yields two results: children 0 and 1 hold the
 * first team's name and score, children 3 and 2 the second team's name and
 * score. Scores use a decimal comma, which is converted to a dot before parsing.
 *
 * @param leagueName the league name inserted into the calendar URL
 * @return the results keyed by day number
 * @throws IOException if the page cannot be downloaded
 */
private Map<Integer, List<TeamResult>> getResults(String leagueName) throws IOException{
    String url = GlobalConfiguration.baseURL + leagueName + GlobalConfiguration.calendarSuffix;
    Document doc = Jsoup.connect(url).get();

    Map<Integer, List<TeamResult>> results = new HashMap<>();
    int day = 1;
    for (Element calendarDay : doc.select(".table")) {
        if (calendarDay.children().is(".greyfoot")) {
            break;
        }
        List<TeamResult> teamResults = new ArrayList<>();
        for (Element match : calendarDay.select(".match")) {
            Elements cells = match.children();
            teamResults.add(new TeamResult(cells.get(0).text(),
                    Double.parseDouble(cells.get(1).text().replace(",", "."))));
            teamResults.add(new TeamResult(cells.get(3).text(),
                    Double.parseDouble(cells.get(2).text().replace(",", "."))));
        }
        results.put(day++, teamResults);
    }
    return results;
}
 
Example 21
Project: ovh-java-sdk   File: ApiOvhUtils.java   View source code 6 votes vote down vote up
/**
 * Create a new OVH Application using https://eu.api.ovh.com/createApp/
 * and output the Application Key and Application Secret through the logger
 * at warn level.
 *
 * <p>The values are extracted from the response HTML with a regular
 * expression; a value that is not found is logged as {@code null}.
 *
 * @param nic      the NIC handle sent as form field {@code nic}
 * @param password the password sent as form field {@code password}
 * @throws IOException if the POST request fails
 */
public void createApplication(String nic, String password) throws IOException {
	String url = "https://eu.api.ovh.com/createApp/";
	String body = Jsoup.connect(url)
			.data("nic", nic)
			.data("password", password)
			.data("applicationName", "One Shoot Token")
			.data("applicationDescription", "One Shoot Token")
			.post()
			.toString();

	Matcher m = Pattern.compile(" Application (\\w+)<pre><name>([^<]+)</name></pre>").matcher(body);
	String key = null;
	String secret = null;
	while (m.find()) {
		String label = m.group(1);
		String value = m.group(2);
		if ("Key".equals(label)) {
			key = value;
		} else if ("Secret".equals(label)) {
			secret = value;
		}
	}
	log.warn("Key:{} Secret:{}", key, secret);
}
 
Example 22
Project: crawler-jsoup-maven   File: JsoupUtil.java   View source code 6 votes vote down vote up
/**
 * Fetches a document, optionally sending a single data parameter with the request.
 *
 * <p>When both key and value are empty the request is delegated to
 * {@code getDocument(url)}; when both are present they are wrapped in a
 * one-entry map and passed to the map-based {@code getDocumentWithData} overload.
 *
 * @param url           the URL to visit
 * @param dataKey       the key of the parameter
 * @param dataValue     the value of the parameter
 * @return Document     the fetched document
 * @throws IllegalArgumentException if exactly one of key and value is empty
 * @throws Exception    propagated from the delegated fetch
 */
public static Document getDocumentWithData(String url, String dataKey, String dataValue) throws Exception {

    boolean keyMissing = StringUtil.isEmpty(dataKey);
    boolean valueMissing = StringUtil.isEmpty(dataValue);

    if (keyMissing != valueMissing) {
        // Invalid arguments: key and value must be supplied together.
        throw new IllegalArgumentException("key or value is err"); // TODO hard coding is fixing bluetata 2017/03/20 add
    }
    if (keyMissing) {
        return getDocument(url);
    }

    Map<String, String> dataMap = new HashMap<String, String>();
    dataMap.put(dataKey, dataValue);
    return getDocumentWithData(url, dataMap);
}
 
Example 23
Project: ripme   File: FlickrRipper.java   View source code 6 votes vote down vote up
/**
 * Loads the page at {@code url} and, if its size list links to another page,
 * loads and returns that page instead.
 *
 * <p>The last entry of {@code ol.sizes-list > li > ol > li} decides: an entry
 * without a link selects the page of {@code this.url}, otherwise the link
 * target is used (site-relative targets are prefixed with the Flickr host).
 *
 * @param url the page to load first
 * @return the linked size page, or the first page when no different page was selected
 * @throws IOException if a page cannot be loaded
 */
private Document getLargestImagePageDocument(URL url) throws IOException {
    // Get current page
    Document doc = Http.url(url).get();

    // Look for larger image page
    final String currentPage = this.url.toExternalForm();
    String largestImagePage = currentPage;
    for (Element olSize : doc.select("ol.sizes-list > li > ol > li")) {
        Elements ola = olSize.select("a");
        if (ola.isEmpty()) {
            largestImagePage = currentPage;
            continue;
        }
        String candImage = ola.get(0).attr("href");
        largestImagePage = candImage.startsWith("/")
                ? "http://www.flickr.com" + candImage
                : candImage;
    }

    if (largestImagePage.equals(currentPage)) {
        return doc;
    }
    // Found larger image page, get it.
    return Http.url(largestImagePage).get();
}
 
Example 24
Project: ripme   File: FlickrRipper.java   View source code 6 votes vote down vote up
/**
 * Resolves the image page for {@code this.url} and queues the first
 * {@code div#allsizes-photo img} source for download.
 *
 * <p>When the {@code download.save_order} setting is enabled the file name
 * prefix is the zero-padded three-digit {@code index} followed by an
 * underscore. A missing image and any I/O failure are logged as errors.
 */
@Override
public void run() {
    try {
        Document doc = getLargestImagePageDocument(this.url);
        Elements fullsizeImages = doc.select("div#allsizes-photo img");
        if (fullsizeImages.isEmpty()) {
            logger.error("Could not find flickr image at " + doc.location() + " - missing 'div#allsizes-photo img'");
            return;
        }

        String prefix = Utils.getConfigBoolean("download.save_order", true)
                ? String.format("%03d_", index)
                : "";
        synchronized (flickrThreadPool) {
            addURLToDownload(new URL(fullsizeImages.first().attr("src")), prefix);
        }
    } catch (IOException e) {
        logger.error("[!] Exception while loading/parsing " + this.url, e);
    }
}
 
Example 25
Project: ChineseStarsRelationship   File: Spider.java   View source code 6 votes vote down vote up
/**
 * Appends the summary paragraph of {@code doc} to {@code Summary.data}.
 *
 * <p>The summary is the text of the first {@code .summary p} match. On any
 * failure an error is logged and the name together with the exception is
 * appended to {@code ErrorSummary.data} instead.
 *
 * @param name the person's name, written in front of the summary
 * @param doc  the page that contains the summary
 */
private static void downloadSummary(String name, Document doc) {

	// Write to file
	try {
		String summary = doc.select(".summary p").first().text();
		logger.debug(summary);
		File summaryFile = new File(rootPath + "Summary.data");
		FileUtils.write(summaryFile, name + "\n" + summary + "\n", "utf-8", true);
	} catch (Exception e) {
		logger.error("个人描述信息 写入:【" + name + "】\t失败!");
		try {
			File errorFile = new File(rootPath + "ErrorSummary.data");
			FileUtils.write(errorFile, name + "\t" + e.toString() + "\n", "utf-8", true);
		} catch (IOException e1) {
			e1.printStackTrace();
		}
	}
}
 
Example 26
Project: InComb   File: RssItem.java   View source code 6 votes vote down vote up
/**
 * Tries to parse the image url out of the description. If it fails <code>null</code> will be returned.
 *
 * <p>The first {@code <img>} with a non-blank {@code src} wins. A root-relative
 * source (single leading {@code /}) is turned into a protocol-relative URL on
 * the host of the content source. Sources that are already protocol-relative
 * ({@code //host/...}) or absolute are returned unchanged.
 *
 * @return Image url in the description. <code>Null</code> if there is no
 *         description or it contains no usable image.
 */
private String parseImage() {
	final String description = getDescription();
	if (description == null) {
		// No description at all: honor the documented null result instead of
		// handing null to the parser.
		return null;
	}

	final Document doc = Jsoup.parse(description);
	for (final Element img : doc.getElementsByTag("img")) {
		String src = img.attr("src");
		if (StringUtils.isNotBlank(src)) {

			// Only root-relative paths need a host. "//host/path" also starts with
			// "/" but already names its host and must not be prefixed again.
			if (src.startsWith("/") && !src.startsWith("//")) {
				try {
					final URL feedUrl = new URL(document.getContentSource().getUrl());
					src = "//" + feedUrl.getHost() + src;
				} catch (final MalformedURLException e) {
					// The source URL is unusable; try the next image.
					continue;
				}
			}

			return src;
		}
	}

	return null;
}
 
Example 27
Project: ripme   File: ImagebamRipper.java   View source code 6 votes vote down vote up
/**
 * Loads the page at {@code url} and queues the first
 * {@code .image-container img} source for download.
 *
 * <p>When the {@code download.save_order} setting is enabled the file name
 * prefix is the zero-padded three-digit {@code index} followed by an
 * underscore. A missing image is logged as a warning, an I/O failure as an error.
 */
private void fetchImage() {
    try {
        // Find image
        Elements images = Http.url(url).get().select(".image-container img");
        if (images.isEmpty()) {
            logger.warn("Image not found at " + this.url);
            return;
        }
        String imgsrc = images.first().attr("src");
        logger.info("Found URL " + imgsrc);

        // Provide prefix and let the AbstractRipper "guess" the filename
        String prefix = Utils.getConfigBoolean("download.save_order", true)
                ? String.format("%03d_", index)
                : "";
        addURLToDownload(new URL(imgsrc), prefix);
    } catch (IOException e) {
        logger.error("[!] Exception while loading/parsing " + this.url, e);
    }
}
 
Example 28
Project: nbaScorePull   File: ExampleUnitTest.java   View source code 5 votes vote down vote up
/**
 * Checks that the away team abbreviation in the line score of a known box
 * score page is {@code CLE}.
 *
 * <p>The value is read from the second cell of the second row inside the
 * element with id {@code div_line_score}. The test downloads the live page.
 */
@Test
public void team_name_isValid() throws Exception {
    String url = "https://www.basketball-reference.com/boxscores/201706120GSW.html";
    Document doc = Jsoup.connect(url).get();
    Element lineScore = doc.getElementById("div_line_score");
    String awayTeamName = lineScore.getElementsByTag("tr").get(1).getElementsByTag("td").get(1).text();
    // Expected value first, actual second, so a failure message reads correctly.
    assertEquals("CLE", awayTeamName);
}
 
Example 29
Project: nifi-nars   File: GetWebpage.java   View source code 5 votes vote down vote up
/**
 * Uses Jsoup to convert from HTML to XHTML.
 *
 * <p>The input is parsed as a body fragment, the output syntax is switched to
 * XML and the serialized document is encoded with {@code charset}.
 *
 * @param html    the HTML fragment to convert
 * @param charset the charset used for output settings and byte encoding
 * @return the serialized document as bytes
 */
private byte[] formatToXHtml(String html, Charset charset) {
    Document document = Jsoup.parseBodyFragment(html);
    document.outputSettings()
            .syntax(Document.OutputSettings.Syntax.xml)
            .charset(charset);
    String xhtml = document.toString();
    return xhtml.getBytes(charset);
}
 
Example 30
Project: ripme   File: BcfakesRipper.java   View source code 5 votes vote down vote up
/**
 * Collects the image URLs of all gallery thumbnails on the page, with the
 * {@code thumbs/thumbs_} path segment removed.
 *
 * @param doc the gallery page
 * @return one URL per matched thumbnail, in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> imageURLs = new ArrayList<>();
    for (Element thumb : doc.select("div.ngg-gallery-thumbnail > a > img")) {
        imageURLs.add(thumb.attr("src").replace("thumbs/thumbs_", ""));
    }
    return imageURLs;
}
 
Example 31
Project: nixmash-blog   File: JsoupServiceImpl.java   View source code 5 votes vote down vote up
/**
 * Fetches {@code url} with a GET request using this service's fixed
 * connection settings: configured user agent, Google referrer, timeout value
 * 12000, redirects followed, HTTP errors and content type ignored.
 *
 * @param url          the URL to fetch
 * @param validateCert passed to {@code validateTLSCertificates}
 * @return the parsed response
 * @throws IOException if the request fails
 */
private Document getDocument(String url, Boolean validateCert)
        throws IOException {
    return Jsoup.connect(url)
            // how the client presents itself
            .userAgent(userAgent)
            .referrer("http://www.google.com")
            // transport behavior
            .timeout(12000)
            .followRedirects(true)
            .validateTLSCertificates(validateCert)
            // response tolerance
            .ignoreHttpErrors(true)
            .ignoreContentType(true)
            .get();
}
 
Example 32
Project: jsouplib   File: AJsoupResponseBodyConverter.java   View source code 5 votes vote down vote up
/**
 * Converts a response body by parsing it with jsoup.
 *
 * <p>When the target type is {@code Document} the parsed document itself is
 * returned; otherwise it is handed to {@code AJsoupReader.deserialize}. The
 * body is closed afterwards in either case.
 *
 * @param value the response body to convert
 * @return the parsed document or the deserialized object
 * @throws IOException if the body cannot be read
 */
@Override
public T convert(ResponseBody value) throws IOException {
    Document document = Jsoup.parse(value.string());
    try {
        if (mT != Document.class) {
            return AJsoupReader.deserialize(document, (Class<T>) mT);
        }
        return (T) document;
    } finally {
        value.close();
    }
}
 
Example 33
Project: vscrawler   File: XpathNode.java   View source code 5 votes vote down vote up
/**
 * Returns the cached model, creating it on first use.
 *
 * <p>The raw text is parsed with jsoup against the base URL and wrapped via
 * {@code SIPNode.e}; if parsing fails or yields {@code null}, the raw text is
 * wrapped via {@code SIPNode.t} instead.
 *
 * @return the model, never created more than once while it stays non-null
 */
@Override
public SipNodes createOrGetModel() {
    if (model != null) {
        return model;
    }
    try {
        Document document = Jsoup.parse(getRawText(), getBaseUrl());
        if (document == null) {
            // Routed to the fallback below.
            throw new RuntimeException();
        }
        model = new SipNodes(SIPNode.e(document));
    } catch (Exception e) {
        model = new SipNodes(SIPNode.t(getRawText()));
    }
    return model;
}
 
Example 34
Project: ripme   File: PorncomixRipper.java   View source code 5 votes vote down vote up
/**
 * Collects the lazy-loaded image URLs of a gallery post.
 *
 * @param doc the post page
 * @return the {@code data-lazy-src} of every gallery image with any
 *         {@code -NNNxNNN} size suffix removed, in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> result = new ArrayList<>();
    for (Element el : doc.select("div.single-post > div.gallery > dl > dt > a > img")) {
        // Strip the three-digit by three-digit size suffix so the full size
        // image is downloaded, not the thumbnail.
        String fullSize = el.attr("data-lazy-src").replaceAll("-\\d\\d\\dx\\d\\d\\d", "");
        result.add(fullSize);
    }
    return result;
}
 
Example 35
Project: MS-Cyber-Security-Solutions   File: DataReceived.java   View source code 5 votes vote down vote up
/**
 * Counts how often {@code searchWord} occurs in the paragraph text of a web page.
 *
 * <p>The page is downloaded with jsoup and only {@code <p>} elements are read.
 * Punctuation is replaced by spaces, the text is split on whitespace, and
 * every word that is not a stop word is inserted (lower-cased) into a trie and
 * counted in a frequency map under its original spelling.
 *
 * @param link       the URL of the page to analyse
 * @param searchWord the word whose frequency is requested
 * @return the frequency reported by {@code PageRanking.getCount} for
 *         {@code searchWord}, or 0 when the page could not be processed
 */
public Integer getData(String link, String searchWord) {
	int count = 0;
	try {
		Trie myTrie = new Trie();

		// Using jsoup to read the web page; only the paragraph tags are considered.
		Document document = Jsoup.connect(link).get();
		TreeMap<String, Integer> frequencyData = new TreeMap<String, Integer>();

		for (Element para : document.select("p")) {
			// Replace punctuation with a blank. The replacement must be a literal
			// space: in a replaceAll replacement string "\\s" is an escaped 's',
			// which would glue the letter s into the text.
			String p = para.text().replaceAll("[,.!?:;()-]", " ");

			// Split on runs of whitespace so that no empty tokens are counted.
			for (String word : p.split("\\s+")) {
				if (word.isEmpty() || StopWord.is(word.toLowerCase())) {
					continue;
				}
				myTrie.insert(word.toLowerCase());
				frequencyData.put(word, PageRanking.getCount(word, frequencyData) + 1);
			}
		}
		// Assigned only after the whole page was processed, so a failure
		// half-way through cannot leak an unrelated running count.
		count = PageRanking.getCount(searchWord, frequencyData);
	} catch (Exception e) {
		e.printStackTrace();
	}
	return count;
}
 
Example 36
Project: crawling-framework   File: ArticleExtractor.java   View source code 5 votes vote down vote up
/**
 * Parses an HTML page into an article and records which matchers produced each part.
 *
 * <p>The article receives the URL and source metadata, the newline-joined
 * titles, the newline-joined normalized texts, and the date of the first
 * publication-date candidate that carries a non-null date. The result
 * additionally records the title, text and publication-date matches. When no
 * candidate has a date, the values and matches of all candidates are recorded
 * instead of those of a single chosen one.
 *
 * @param html          raw HTML of the page
 * @param url           page URL, also used as the base URI for parsing
 * @param source        source configuration (URL, app ids, categories, text normalizers)
 * @param publishedHint hint passed through to the publication-date extraction
 * @return the parse result holding the article and the match details
 */
public static HttpArticleParseResult extractArticleWithDetails(String html, String url, HttpSource source, String publishedHint) {
    Document document = Jsoup.parse(html, url);
    HttpArticleParseResult result = new HttpArticleParseResult();

    // Metadata copied straight from the arguments.
    HttpArticle article = new HttpArticle();
    article.setUrl(url);
    article.setSource(source.getUrl());
    article.setAppIds(source.getAppIds());
    article.setCategories(source.getCategories());

    JsonLdParser.JsonLdArticle ldJsonArticle = JsonLdParser.parse(JsonLdParser.extractJsonLdParts(document));

    // Titles: joined values go to the article, the matches to the result.
    List<MatchedString> titleCandidates = extractTitlesWithJsoup(document, ldJsonArticle, source);
    String joinedTitle = titleCandidates.stream()
            .map(MatchedString::getValue)
            .collect(Collectors.joining("\n"));
    article.setTitle(joinedTitle);
    result.setTitleMatches(titleCandidates.stream().map(MatchedString::getMatch).collect(Collectors.toList()));

    // Texts: each value is normalized before joining; matches are de-duplicated.
    List<MatchedString> textCandidates = extractTextsWithJsoup(document, source);
    String joinedText = textCandidates.stream()
            .map(MatchedString::getValue)
            .map(raw -> TextFilters.normalizeText(raw, source.getTextNormalizers()))
            .collect(Collectors.joining("\n"));
    article.setText(joinedText);
    result.setTextMatches(textCandidates.stream().map(MatchedString::getMatch).distinct().collect(Collectors.toList()));

    // Publication date: the first candidate that actually carries a date wins.
    List<MatchedDate> dateCandidates = extractPublicationDates(html, document, ldJsonArticle, source, publishedHint);
    MatchedDate chosenDate = null;
    for (MatchedDate candidate : dateCandidates) {
        if (candidate.getDate() != null) {
            chosenDate = candidate;
            break;
        }
    }
    article.setPublished(chosenDate != null ? chosenDate.getDate() : null);
    result.setPublishedPattern(chosenDate != null ? chosenDate.getPattern() : null);

    List<String> publishedTexts;
    List<String> publishedMatches;
    if (chosenDate != null) {
        publishedTexts = Lists.newArrayList(chosenDate.getValue());
        publishedMatches = Lists.newArrayList(chosenDate.getMatch());
    } else {
        // Nothing had a date: keep every candidate's value and match.
        publishedTexts = dateCandidates.stream().map(MatchedDate::getValue).collect(Collectors.toList());
        publishedMatches = dateCandidates.stream().map(MatchedDate::getMatch).collect(Collectors.toList());
    }
    result.setPublishedTexts(publishedTexts);
    result.setPublishedMatches(publishedMatches);

    result.setArticle(article);
    return result;
}
 
Example 37
Project: PicKing   File: Aitaotu.java   View source code 5 votes vote down vote up
/**
 * Finds the URL of the next content page.
 *
 * <p>The response bytes are decoded as UTF-8 and parsed; the first anchor under
 * {@code #pageNum} whose own text contains "下一页" ("next page") supplies the
 * {@code href}, which is appended to {@code baseUrl}.
 *
 * @param baseUrl    prefix prepended to the link's {@code href}
 * @param currentUrl URL of the current page (not used here)
 * @param result     raw bytes of the current page
 * @return {@code baseUrl} plus the link's {@code href}, or an empty string when no such link exists
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public String getContentNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    String html = new String(result, "utf-8");
    Elements nextLinks = Jsoup.parse(html).select("#pageNum a:containsOwn(下一页)");
    if (nextLinks.isEmpty()) {
        return "";
    }
    String href = nextLinks.get(0).attr("href");
    return baseUrl + href;
}
 
Example 38
Project: AlipayOrdersSupervisor-GUI   File: ApsvTimerTask.java   View source code 5 votes vote down vote up
/**
 * Extracts the orders listed in a trade-records HTML page.
 *
 * <p>If the page lacks the {@code J-submit-form} element, an error is logged,
 * the current task is marked as failed and an empty list is returned.
 * Otherwise every {@code tr} under {@code #tradeRecordsIndex>tbody} is turned
 * into a signed {@code ApsvOrder}.
 *
 * @param html raw HTML of the page
 * @return the orders found, in row order; empty when the form is missing
 */
private ArrayList<ApsvOrder> findOrders(String html) {
    ArrayList<ApsvOrder> orders = new ArrayList<>();
    Document doc = Jsoup.parse(html);

    if (doc.getElementById("J-submit-form") == null) {
        logger.error("Cannot find order list form, maybe cookie expires");
        // Mark the task status as failed.
        // TODO show a popup warning about the cookie problem
        RunTasksModel.getInstance().MarkTaskException(task.id);
        return orders;
    }

    for (Element orderRow : doc.select("#tradeRecordsIndex>tbody").select("tr")) {
        Elements timeCells = orderRow.select("td.time p");
        String[] tradeNoParts = orderRow.select("td.tradeNo p").text().split("\\|");
        // Anonymous subclass with an instance initializer, populating the order's fields in place.
        ApsvOrder order = new ApsvOrder() {
            {
                taskId = task.id;
                // First and last time paragraphs, joined by a space.
                time = timeCells.get(0).text() + " " + timeCells.get(timeCells.size() - 1).text();
                description = orderRow.select(".memo-info").text();
                memo = orderRow.select("td.memo p").text();
                // Use the second "|"-separated part when present, otherwise the first;
                // the number is whatever follows the first ":".
                tradeNo = tradeNoParts[tradeNoParts.length > 1 ? 1 : 0].split(":")[1];
                username = Unicode.unicodeToString(orderRow.select("td.other p").text());
                amount = Float.parseFloat(orderRow.select("td.amount span").text().replaceAll("\\s+", ""));
                status = orderRow.select("td.status p").text();
            }
        };
        order.sig = Order.Sign(order, task.pushSecret);
        orders.add(order);
    }
    return orders;
}
 
Example 39
Project: NTPaprEng   File: PaperWebPage.java   View source code 5 votes vote down vote up
/**
 * Reads the volume number from the page.
 *
 * <p>The text of the elements matching {@code VOLUM_CSS_SELECTOR} is cut at
 * {@code VOLUM_TEXT_OFFSET} and the remainder is parsed as an integer.
 *
 * @param dom parsed page
 * @return the parsed number, or 0 when the text cannot be cut or parsed
 */
private int parseVolum(final Document dom) {
    final Elements matched = dom.select(VOLUM_CSS_SELECTOR);

    int parsed;
    try {
        final String label = matched.text();
        parsed = Integer.parseInt(label.substring(VOLUM_TEXT_OFFSET));
    } catch (Exception ignored) {
        // Best effort: any failure while reading the text falls back to 0.
        parsed = 0;
    }
    return parsed;
}
 
Example 40
Project: ripme   File: FuraffinityRipper.java   View source code 5 votes vote down vote up
/**
 * Resolves the image URL for every post linked from a listing page.
 *
 * <p>For each anchor matching {@code figure.t-image > b > u > a}, the
 * {@code href} is appended to {@code urlBase} and handed to
 * {@code getImageFromPost}; whatever that returns is collected.
 *
 * @param page parsed listing page
 * @return the values returned by {@code getImageFromPost}, in document order
 */
@Override
public List<String> getURLsFromPage(Document page) {
    List<String> imageUrls = new ArrayList<>();
    for (Element postLink : page.select("figure.t-image > b > u > a")) {
        String postPath = postLink.select("a").first().attr("href");
        imageUrls.add(getImageFromPost(urlBase + postPath));
    }
    return imageUrls;
}