Java Code Examples for org.jsoup.nodes.Document#text()

The following examples show how to use org.jsoup.nodes.Document#text(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to the original project or source file. You may also check out the related API usage in the sidebar.
Example 1
Source File: HtmlHelper.java    From FairEmail with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Flattens a document to plain text, either in full or as a short preview.
 *
 * <p>The document is modified in place: it is handed to {@code truncate} first, and every
 * {@code blockquote} gets a leading "[" and a trailing "]" text node before the text is read.
 *
 * @param d    document to flatten; mutated by this call
 * @param full {@code true} to return the whole text, {@code false} to cap it at
 *             {@code PREVIEW_SIZE} characters
 * @return the whole text, or the capped text followed by an ellipsis when something was cut
 */
private static String _getText(Document d, boolean full) {
    truncate(d, !full);

    // Bracket quoted material so it stays recognisable once the markup is gone.
    for (Element quote : d.select("blockquote")) {
        quote.prependChild(new TextNode("["));
        quote.appendChild(new TextNode("]"));
    }

    final String all = d.text();
    if (full || all.length() <= PREVIEW_SIZE) {
        return all;
    }

    return all.substring(0, PREVIEW_SIZE) + "…";
}
 
Example 2
Source File: JerryExtractor.java    From web-data-extractor with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the input with jsoup's XML parser and renders it according to {@code outType}.
 *
 * @param str markup to parse
 * @return {@code document.html()} for {@code TYPE_HTML}; {@code document.text()} for
 *         {@code TYPE_TEXT} and for every other value
 */
private String parse(String str) {
    final Document parsed = Jsoup.parse(str, "", Parser.xmlParser());
    switch (outType) {
        case TYPE_HTML:
            return parsed.html();
        case TYPE_TEXT:
        default:
            // Plain text is both the explicit TYPE_TEXT result and the fallback.
            return parsed.text();
    }
}
 
Example 3
Source File: duckdns.java    From DuckDNSClient with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Asks {@code checkip.amazonaws.com} for this machine's public IP address.
 *
 * @return the text of the response, or an empty string when the request fails or the
 *         response is shorter than 7 characters; an empty value lets DuckDNS resolve
 *         the address itself
 */
private static String getip() {
    String url = "http://checkip.amazonaws.com";
    String ip = "";

    try {
        Document doc = Jsoup.connect(url)
                .header("Cache-Control", "no-cache")
                .ignoreHttpErrors(true)
                .ignoreContentType(true)
                .timeout(10 * 1000)
                .get();
        ip = doc.text();
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Check the extracted text, not the document: when the request throws, no document
    // exists, and dereferencing it here used to raise a NullPointerException.
    if (ip.length() < 7) {
        // can't get ip address, let DuckDNS resolve it
        ip = "";
    }
    return ip;
}
 
Example 4
Source File: PromPHPCrawlerServiceImpl.java    From ShadowSocks-Share with Apache License 2.0 5 votes vote down vote up
/**
 * Turns a Base64-encoded SSR subscription document into a set of account entries.
 *
 * <p>The document text is Base64-decoded as UTF-8 and split on newlines; each non-blank
 * line is parsed with {@code parseLink}, tagged with fixed metadata, probed with
 * {@code isReachable} and added to the result. Lines are processed on a parallel stream,
 * and a failure on one line is logged without affecting the others.
 *
 * @param document fetched subscription document
 * @return a synchronized set holding every entry that could be parsed, reachable or not
 */
@Override
protected Set<ShadowSocksDetailsEntity> parse(Document document) {

	// Content of the SSR subscription address
	final String encoded = document.text();
	final String decoded = StringUtils.toEncodedString(Base64.decodeBase64(encoded), StandardCharsets.UTF_8);
	final String[] links = decoded.split("\n");

	final Set<ShadowSocksDetailsEntity> found = Collections.synchronizedSet(new HashSet<>(links.length));

	Arrays.asList(links).parallelStream().forEach(link -> {
		if (!StringUtils.isNotBlank(link)) {
			return;
		}
		try {
			final ShadowSocksDetailsEntity entry = parseLink(link.trim());
			entry.setValid(false);
			entry.setValidTime(new Date());
			entry.setTitle("免费账号 | 云端框架");
			entry.setRemarks("https://cloudfra.com/");
			entry.setGroup("ShadowSocks-Share");

			// Probe the network
			if (isReachable(entry)) {
				entry.setValid(true);
			}

			// Store the entry whether or not it is usable
			found.add(entry);

			log.debug("*************** 第 {} 条 ***************{}{}", found.size(), System.lineSeparator(), entry);
		} catch (Exception e) {
			log.error(e.getMessage(), e);
		}
	});

	return found;
}
 
Example 5
Source File: CharRepresentation.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Parses an HTML file and passes each space-delimited token of its text to
 * {@code addStringToCharRep}.
 *
 * @param inputFile HTML file to read, decoded with {@code ENCODING}
 * @throws IOException if the file cannot be read or parsed
 */
private void addHtmlFileToCharRep(File inputFile) throws IOException {
  final String text = Jsoup.parse(inputFile, ENCODING).text();
  for (java.util.StringTokenizer words = new java.util.StringTokenizer(text, " "); words.hasMoreTokens(); ) {
    addStringToCharRep(words.nextToken());
  }
}
 
Example 6
Source File: RandomGoogleTrendExtractor.java    From swcv with MIT License 5 votes vote down vote up
/**
 * Downloads the feed at the URL built by {@code constructQuery} and extracts trend strings.
 *
 * <p>The response is read as JSON. For every element of {@code responseData.feed.entries},
 * the {@code content} HTML is parsed, a "." is appended to each anchor so that adjacent
 * links become separable, and the resulting text is split on ".". The collected pieces
 * are passed to {@code getNonEmptyTrends}.
 *
 * @return whatever {@code getNonEmptyTrends} returns for the collected pieces
 * @throws Exception if the download, the JSON parsing or the extraction fails
 */
private static String[] downloadTrends() throws Exception
{
    String query = constructQuery();
    InputStream is = new URL(query).openStream();
    try
    {
        StringWriter writer = new StringWriter();
        // Decode explicitly as UTF-8: the two-argument copy() uses the platform default
        // charset, which corrupts non-ASCII trend names on machines whose default differs.
        IOUtils.copy(is, writer, "UTF-8");
        JSONObject feed = new JSONObject(writer.toString())
                .getJSONObject("responseData")
                .getJSONObject("feed");

        JSONArray arr = feed.getJSONArray("entries");
        List<String> res = new ArrayList<String>();
        for (int i = 0; i < arr.length(); i++)
        {
            String en = arr.getJSONObject(i).getString("content");

            Document doc = Jsoup.parse(en);
            for (Element t : doc.select("a"))
            {
                t.append(".");
            }

            res.addAll(Arrays.asList(doc.text().split("\\.")));
        }

        return getNonEmptyTrends(res);
    }
    finally
    {
        is.close();
    }
}
 
Example 7
Source File: duckdns.java    From DuckDNSClient with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Sends an update request to DuckDNS for the given domain.
 *
 * @param domain    value for the {@code domains} query parameter
 * @param token     value for the {@code token} query parameter
 * @param ipaddress value for the {@code ip} query parameter
 * @return the text of the response, or an empty string when the request fails
 */
private static String updateDuckDNS(String domain, String token, String ipaddress) {
    String url = "http://www.duckdns.org/update?domains=" + domain + "&token=" + token + "&ip=" + ipaddress;
    String ua = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2";
    try {
        Document doc = Jsoup.connect(url)
                .userAgent(ua)
                .ignoreHttpErrors(true)
                .ignoreContentType(true)
                .timeout(10 * 1000)
                .get();
        return doc.text();
    } catch (IOException e) {
        e.printStackTrace();
        // No document exists on this path; the old code went on to call doc.text() on
        // null and crashed with a NullPointerException instead of reporting the failure.
        return "";
    }
}
 
Example 8
Source File: MainFragment.java    From android-app with The Unlicense 5 votes vote down vote up
/**
 * Fetches the insult generator's response for the current {@code language} and stores
 * its text in {@code title}. A failure is printed and {@code title} is left untouched.
 *
 * @param params unused
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
    try {
        final String endpoint = "https://evilinsult.com/generate_insult.php?lang=" + language;
        title = Jsoup.connect(endpoint).get().text();
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return null;
}
 
Example 9
Source File: MainFragment.java    From android-app with The Unlicense 5 votes vote down vote up
/**
 * Background step: requests an insult in the selected {@code language} and keeps the
 * response text in {@code title}. Any exception is printed; {@code title} then keeps
 * its previous value.
 *
 * @param params unused
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
    try {
        final Document response =
                Jsoup.connect("https://evilinsult.com/generate_insult.php?lang=" + language).get();
        title = response.text();
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return null;
}
 
Example 10
Source File: MainActivity.java    From android-app with The Unlicense 5 votes vote down vote up
/**
 * Fetches an English insult and stores the response text in {@code title}; on failure
 * the exception is printed and its message is stored instead.
 *
 * @param params unused
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
    String outcome;
    try {
        outcome = Jsoup.connect("https://evilinsult.com/generate_insult.php?lang=en").get().text();
    } catch (Exception ex) {
        ex.printStackTrace();
        outcome = ex.getMessage();
    }
    title = outcome;
    return null;
}
 
Example 11
Source File: MainActivity.java    From android-app with The Unlicense 5 votes vote down vote up
/**
 * Fetches an English insult and stores the response text in {@code title}; on failure
 * the exception is printed and its message is stored instead.
 *
 * @param params unused
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
    try {
        // Use HTTPS: cleartext HTTP is blocked by default on recent Android releases and
        // exposes the request to tampering.
        Document doc1 = Jsoup.connect("https://evilinsult.com/generate_insult.php?lang=en").get();
        title = doc1.text();
    } catch (Exception ex) {
        ex.printStackTrace();
        title = ex.getMessage();
    }
    return null;
}
 
Example 12
Source File: TextPageProcessor.java    From ScriptSpider with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the page.
 *
 * <p>Extracts the useful information (document title, document text and the URL of the
 * page's seed) and stores it on the page as its items. According to the original note,
 * a later save step persists those items.
 *
 * @param page page
 */
public void process(Page page) {
    final Document parsed = page.getDocument();

    final Map<String, String> extracted = new HashMap<String, String>();
    extracted.put("title", parsed.title());
    extracted.put("text", parsed.text());
    extracted.put("url", page.getUrlSeed().getUrl());

    page.setItems(extracted);
}
 
Example 13
Source File: PlumbleService.java    From Plumble with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Handles a logged chat message: optionally speaks it, optionally shows a notification,
 * and always appends it to the message log.
 *
 * <p>The spoken text is the message with its HTML removed. When short TTS messages are
 * enabled, anchors whose visible text equals their {@code href} are first rewritten using
 * the {@code chat_message_tts_short_link} string and the link's hostname.
 *
 * @param message the message that was logged
 */
@Override
public void onMessageLogged(IMessage message) {
    // Parse as an HTML fragment; text() yields the message without any tags.
    final Document parsed = Jsoup.parseBodyFragment(message.getMessage());
    String spoken = parsed.text();

    if (mShortTtsMessagesEnabled) {
        for (Element link : parsed.getElementsByTag("A")) {
            final String target = link.attr("href");
            // Anchors with custom text are left as they are.
            if (target == null || !target.equals(link.text())) {
                continue;
            }
            // Replace the bare URL with a short form built from its hostname.
            final String host = HtmlUtils.getHostnameFromLink(target);
            if (host != null) {
                link.text(getString(R.string.chat_message_tts_short_link, host));
            }
        }
        spoken = parsed.text();
    }

    final String announcement = getString(R.string.notification_message,
            message.getActorName(), spoken);

    // Speak only if TTS is enabled and available, the text fits the threshold, and there
    // is a session user who is not self-deafened.
    if (mSettings.isTextToSpeechEnabled()
            && mTTS != null
            && announcement.length() <= TTS_THRESHOLD
            && getSessionUser() != null
            && !getSessionUser().isSelfDeafened()) {
        mTTS.speak(announcement, TextToSpeech.QUEUE_ADD, null);
    }

    // TODO: create a customizable notification sieve
    if (mSettings.isChatNotifyEnabled()) {
        mMessageNotification.show(message);
    }

    mMessageLog.add(new IChatMessage.TextMessage(message));
}
 
Example 14
Source File: CheckUtils.java    From Natty with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Removes all markup from an HTML string.
 *
 * @param html HTML to parse
 * @return the text of the parsed document
 */
public static String stripTags(String html) {
    return Jsoup.parse(html).text();
}
 
Example 15
Source File: ActivityMgmtService.java    From symphonyx with Apache License 2.0 4 votes vote down vote up
/**
 * Collects 1A0001.
 *
 * <p>Reads the user's latest 1A0001 activity point transfer, derives a "0"/"1" result from
 * the last digit of a price fetched from a remote quote endpoint, and compares it with the
 * choice stored in that transfer's data id. On a match, twice the recorded sum is
 * transferred from the system account to the user.
 *
 * @param userId the specified user id
 * @return result; starts out as {@code Results.falseResult()}, always carries a message
 *         under {@code Keys.MSG}, and has {@code Keys.STATUS_CODE} set to {@code true} once
 *         the participation checks and the remote lookup have succeeded
 */
public synchronized JSONObject collect1A0001(final String userId) {
    final JSONObject ret = Results.falseResult();

    // Bail out with a message when the query service reports no 1A0001 entry today.
    if (!activityQueryService.is1A0001Today(userId)) {
        ret.put(Keys.MSG, langPropsService.get("activityNotParticipatedLabel"));

        return ret;
    }

    // Bail out with a message when today's entry has already been collected.
    if (activityQueryService.isCollected1A0001Today(userId)) {
        ret.put(Keys.MSG, langPropsService.get("activityParticipatedLabel"));

        return ret;
    }

    // NOTE(review): assumes at least one record exists and that its data id contains a
    // "-"; otherwise get(0) or [1] throws — confirm the checks above guarantee this.
    final List<JSONObject> records = pointtransferQueryService.getLatestPointtransfers(userId,
            Pointtransfer.TRANSFER_TYPE_C_ACTIVITY_1A0001, 1);
    final JSONObject pointtransfer = records.get(0);
    final String data = pointtransfer.optString(Pointtransfer.DATA_ID);
    // The part after the first "-" is the stored choice compared against the result below.
    final String smallOrLarge = data.split("-")[1];
    final int sum = pointtransfer.optInt(Pointtransfer.SUM);

    String smallOrLargeResult = null;
    try {
        // Fetch the quote document (5000 ms timeout) and parse its text as JSON.
        final Document doc = Jsoup.parse(new URL("http://stockpage.10jqka.com.cn/1A0001/quote/header/"), 5000);
        final JSONObject result = new JSONObject(doc.text());
        final String price = result.optJSONObject("data").optJSONObject("1A0001").optString("10");

        if (!price.contains(".")) {
            // No decimal point: the result is "0".
            smallOrLargeResult = "0";
        } else {
            int endInt = 0;
            // Only a fractional part longer than one character contributes its last
            // character; otherwise endInt stays 0.
            if (price.split("\\.")[1].length() > 1) {
                final String end = price.substring(price.length() - 1);
                endInt = Integer.valueOf(end);
            }

            // 0-4 maps to "0", 5-9 maps to "1".
            if (0 <= endInt && endInt <= 4) {
                smallOrLargeResult = "0";
            } else if (5 <= endInt && endInt <= 9) {
                smallOrLargeResult = "1";
            } else {
                // smallOrLargeResult stays null here, so the comparison below cannot match.
                LOGGER.error("Activity 1A0001 collect result [" + endInt + "]");
            }
        }
    } catch (final Exception e) {
        // Any failure while fetching or interpreting the quote ends the call with the
        // failure message.
        LOGGER.log(Level.ERROR, "Collect 1A0001 failed", e);

        ret.put(Keys.MSG, langPropsService.get("activity1A0001CollectFailLabel"));

        return ret;
    }

    // A missing stored choice is treated as a failure.
    if (Strings.isEmptyOrNull(smallOrLarge)) {
        ret.put(Keys.MSG, langPropsService.get("activity1A0001CollectFailLabel"));

        return ret;
    }

    // From here on the status is true regardless of whether the choice matched.
    ret.put(Keys.STATUS_CODE, true);
    if (StringUtils.equals(smallOrLarge, smallOrLargeResult)) {
        // Matching choice: transfer double the recorded sum.
        final int amount = sum * 2;

        // A null return from transfer(...) is treated as a failed transfer.
        final boolean succ = null != pointtransferMgmtService.transfer(Pointtransfer.ID_C_SYS, userId,
                Pointtransfer.TRANSFER_TYPE_C_ACTIVITY_1A0001_COLLECT, amount,
                DateFormatUtils.format(new Date(), "yyyyMMdd") + "-" + smallOrLargeResult);

        if (succ) {
            String msg = langPropsService.get("activity1A0001CollectSucc1Label");
            msg = msg.replace("{point}", String.valueOf(amount));

            ret.put(Keys.MSG, msg);
        } else {
            ret.put(Keys.MSG, langPropsService.get("activity1A0001CollectFailLabel"));
        }
    } else {
        // Choice did not match: only a message is set, no points are transferred.
        ret.put(Keys.MSG, langPropsService.get("activity1A0001CollectSucc0Label"));
    }

    return ret;
}
 
Example 16
Source File: Jsoup403ForbiddenExample.java    From crawler-jsoup-maven with Apache License 2.0 4 votes vote down vote up
/**
 * Fetches a web page and prints its text to standard output; an I/O failure is printed
 * as a stack trace.
 *
 * @param args unused
 */
public static void main(String[] args) {
    try {
        // Connect to the website and fetch the HTML document.
        final Document page = Jsoup.connect("http://www.bluetata.com").get();

        // Extract the text from the document and print it.
        System.out.println(page.text());
    } catch (IOException ioex) {
        ioex.printStackTrace();
    }
}
 
Example 17
Source File: HtmlDocumentInterpreter.java    From camunda-bpm-swagger with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the text of an HTML block with its markup removed.
 *
 * @param node block whose characters are parsed as an HTML body fragment
 * @return the text of the parsed fragment
 */
String getText(final HtmlBlock node) {
  final String markup = node.getChars().toString();
  return Jsoup.parseBodyFragment(markup).text();
}
 
Example 18
Source File: RSSScraper.java    From Babler with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the RSS feed at {@code this.url} and saves every entry whose content is reliably
 * detected as being in {@code this.language}.
 *
 * <p>For each entry, the first content element is stripped of HTML and run through
 * language detection. Matching entries are stored via {@code DAO.saveEntry}; when that
 * succeeds the file is saved, {@code numOfFiles} is incremented and {@code wrongCount} is
 * reset. Non-matching entries increment {@code wrongCount}, and the feed is abandoned once
 * that counter exceeds 3. An exception while handling an entry is logged and also ends
 * the loop.
 *
 * @return a pair of ({@code numOfFiles}, {@code wrongCount}) as they stand after the run
 * @throws Exception if the URL is malformed or the feed cannot be fetched or parsed
 */
public AbstractMap.SimpleEntry<Integer, Integer> fetchAndSave() throws Exception {

    URL url = new URL(this.url);

    SyndFeedInput input = new SyndFeedInput();
    SyndFeed feed = input.build(new XmlReader(url));

    List<SyndEntry> entries = feed.getEntries();
    int items = entries.size();

    if (items > 0) {
        log.info("Attempting to parse rss feed: " + this.url);
        log.info("This Feed has " + items + " items");
    }

    for (SyndEntry item : entries) {
        log.info("Title: " + item.getTitle());
        log.info("Link: " + item.getLink());

        // Entries without a content element used to abort the whole feed with an
        // IndexOutOfBoundsException from get(0); skip just that entry instead.
        List<?> contents = item.getContents();
        if (contents == null || contents.isEmpty()) {
            log.info("Item " + item.getTitle() + " has no content, skipping this post");
            continue;
        }
        SyndContentImpl contentHolder = (SyndContentImpl) contents.get(0);
        String content = contentHolder.getValue();

        // content might contain html data, let's clean it up
        Document doc = Jsoup.parse(content);
        content = doc.text();
        try {
            Result result = ld.detectLanguage(content, language);
            if (result.languageCode.equals(language) && result.isReliable) {

                FileSaver file = new FileSaver(content, this.language, "bs", item.getLink(), item.getUri(), String.valueOf(content.hashCode()));
                String fileName = file.getFileName();
                BlogPost post = new BlogPost(content, this.language, null, "bs", item.getLink(), item.getUri(), fileName);
                if (DAO.saveEntry(post)) {
                    file.save(this.logDb);
                    numOfFiles++;
                    wrongCount = 0;
                }

            } else {
                log.info("Item " + item.getTitle() + "is in a diff languageCode, skipping this post  "+ result.languageCode);
                wrongCount++;
                if (wrongCount > 3) {
                    log.info("Already found 3 posts in the wrong languageCode, skipping this blog");
                    // Give up on the feed only once the threshold is exceeded. The break
                    // used to sit after this if-block, so the first wrong-language post
                    // ended the loop and the threshold never took effect.
                    break;
                }
            }
        } catch (Exception e) {
            log.error(e);
            break;
        }
    }
    return new AbstractMap.SimpleEntry<>(numOfFiles, wrongCount);
}
 
Example 19
Source File: HTMLSanitizer.java    From document-management-software with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Removes every {@code iframe} element from the given HTML and returns the text of what
 * remains.
 *
 * <p>Note that the result is the document's plain text, not HTML: all other markup is
 * dropped as well.
 *
 * @param htmlContent HTML to process
 * @return the text of the document after its iframes have been removed
 */
public static String stripIframes(String htmlContent) {
	// Jsoup.parse(String, String) takes a base URI as its second argument, not a charset;
	// the old "UTF-8" argument was being used as the base URI. A String input needs no
	// charset, so the single-argument overload is the correct call.
	Document doc = Jsoup.parse(htmlContent);
	doc.select("iframe").remove();
	return doc.text();
}
 
Example 20
Source File: CharRepresentation.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Parses an HTML file and passes its whole text to {@code addStringToTermRep}.
 *
 * @param inputFile HTML file to read, decoded with {@code ENCODING}
 * @throws IOException if the file cannot be read or parsed
 */
private void addHtmlFileToTermRep(File inputFile) throws IOException {
  addStringToTermRep(Jsoup.parse(inputFile, ENCODING).text());
}