Java Code Examples for org.jsoup.nodes.Document#title()

The following examples show how to use org.jsoup.nodes.Document#title(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
/**
 * Parses an HTML page and collects its title plus the longer link texts found in its item lists.
 *
 * <p>The first entry of the result is always the document title. It is followed by the own text
 * of every {@code <a>} located inside a {@code <ul>} that carries both the {@code panel_body}
 * and {@code itemlist} classes, provided that text is longer than 20 characters.
 *
 * @param html raw HTML markup to parse
 * @return the title followed by the matching link texts, in document order
 */
private List<String> parseData(String html) {
    Document document = Jsoup.parse(html);
    List<String> strings = new ArrayList<>();
    strings.add(document.title());

    for (Element list : document.getElementsByTag("ul")) {
        // Test the current <ul>, not the whole collection: Elements#hasClass is true as soon as
        // ANY element matches, which pulled in links from unrelated lists.
        // hasClass expects a single class name, so the two classes are checked separately.
        if (!list.hasClass("panel_body") || !list.hasClass("itemlist")) {
            continue;
        }
        for (Element link : list.getElementsByTag("a")) {
            String text = link.ownText();
            if (text.length() > 20) {
                strings.add(text);
            }
        }
    }
    return strings;
}
 
Example 2
Source Project: ripme   File: TwitchVideoRipper.java    License: MIT License 6 votes vote down vote up
/**
 * Fetches the page at this ripper's URL, looks for a {@code "source":"..."} entry in each
 * {@code <script>} element and hands every URL found to {@code addURLToDownload}, using the
 * page title as part of the file name. Finally waits via {@code waitForThreads()}.
 *
 * @throws IOException if the page cannot be fetched or contains no script elements
 */
@Override
public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document page = Http.url(url).get();

    // The page title doubles as a user friendly file name.
    String pageTitle = page.title();

    Elements scripts = page.select("script");
    if (scripts.isEmpty()) {
        throw new IOException("Could not find script code at " + url);
    }

    // Only the first match per script is used: the regex assumes the highest quality
    // source is listed first.
    Pattern sourcePattern = Pattern.compile("\"source\":\"(.*?)\"");
    for (Element scriptTag : scripts) {
        Matcher sourceMatcher = sourcePattern.matcher(scriptTag.data());
        if (!sourceMatcher.find()) {
            continue;
        }
        addURLToDownload(new URL(sourceMatcher.group(1)), HOST + "_" + pageTitle);
    }
    waitForThreads();
}
 
Example 3
/**
 * Fetches a page with jsoup and reads its title, its body text and, for every
 * {@code a[href]} element, the href, text, outer HTML and inner HTML.
 *
 * <p>The extracted values are only held in local variables; nothing is returned or stored.
 * If the fetch fails with an {@link IOException} the method does nothing further.
 *
 * @param href address of the page to fetch
 */
public void extractDataWithJsoup(String href){
	Document page = null;
	try {
		page = Jsoup.connect(href)
				.timeout(10*1000)
				.userAgent("Mozilla")
				.ignoreHttpErrors(true)
				.get();
	} catch (IOException e) {
		//Your exception handling here
	}
	if (page == null) {
		// Nothing was fetched, so there is nothing to extract.
		return;
	}

	String title = page.title();
	String text = page.body().text();
	for (Element anchor : page.select("a[href]")) {
		String linkHref = anchor.attr("href");
		String linkText = anchor.text();
		String linkOuterHtml = anchor.outerHtml();
		String linkInnerHtml = anchor.html();
	}
}
 
Example 4
Source Project: superword   File: DynamicIp.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks connectivity by requesting a Baidu search page and looking for the search term
 * in the title of the returned document.
 *
 * @return {@code true} if the page title contains the search term; {@code false} otherwise,
 *         including when any exception is raised while fetching the page
 */
public static boolean isConnected(){
    try {
        // The current time is appended as an extra query parameter on every call.
        String searchUrl = "http://www.baidu.com/s?wd=杨尚川&t=" + System.currentTimeMillis();
        Document resultPage = Jsoup.connect(searchUrl)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", "https://www.baidu.com")
                .header("Host", "www.baidu.com")
                .header("User-Agent", USER_AGENT)
                .ignoreContentType(true)
                .timeout(30000)
                .get();
        String pageTitle = resultPage.title();
        LOGGER.info("搜索结果页面标题:"+pageTitle);
        return pageTitle != null && pageTitle.contains("杨尚川");
    }catch (Exception e){
        // An unreachable network is an expected outcome and is not logged as an error.
        if(!"Network is unreachable".equals(e.getMessage())){
            LOGGER.error("状态检查失败:"+e.getMessage());
        }
        return false;
    }
}
 
Example 5
Source Project: VileBot   File: UrlTitleAnnouncer.java    License: MIT License 6 votes vote down vote up
/**
 * Downloads an HTML page and returns the title of the parsed document.
 *
 * @param url http URI String of the page to fetch
 * @return the title reported by the parsed document, or an empty string if fetching the page
 *         fails with an {@link IOException}
 */
private String scrapeURLHTMLTitle( String url )
{
    try
    {
        return Jsoup.connect( url ).get().title();
    }
    catch ( IOException fetchError )
    {
        // Best effort: report the failure on stderr and fall back to an empty title.
        System.err.format( "scrapeURLHTMLTitle BufferedReader error: %s%n", fetchError );
        return "";
    }
}
 
Example 6
Source Project: rank   File: DynamicIp.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Probes connectivity by fetching a Baidu search result page and checking whether the
 * page title contains the searched term.
 *
 * @return {@code true} when the title contains the term; {@code false} in every other case,
 *         including when fetching the page raises an exception
 */
public static boolean isConnected(){
    boolean connected = false;
    try {
        Document page = Jsoup.connect("http://www.baidu.com/s?wd=杨尚川&t=" + System.currentTimeMillis())
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", "https://www.baidu.com")
                .header("Host", "www.baidu.com")
                .header("User-Agent", USER_AGENT)
                .ignoreContentType(true)
                .timeout(30000)
                .get();
        String title = page.title();
        LOGGER.info("搜索结果页面标题:"+title);
        connected = title != null && title.contains("杨尚川");
    }catch (Exception e){
        // "Network is unreachable" is treated as a plain negative result; anything else is logged.
        boolean unreachable = "Network is unreachable".equals(e.getMessage());
        if(!unreachable){
            LOGGER.error("状态检查失败:"+e.getMessage());
        }
    }
    return connected;
}
 
Example 7
Source Project: wx-crawl   File: WxCrawlServiceImpl.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Resolves the article title from a document, trying three sources in order:
 * the {@code ARTICLE_TITLE} attribute on the head element, the text of the first element
 * matching the {@code TITLE} selector, and finally the document's own title.
 *
 * @param sourceDoc parsed document to read the title from
 * @return the first title found by the lookups above
 */
private String getArticleTitle(Document sourceDoc) {
    // 1. Attribute stored on <head>, if the head exists and the attribute is non-empty.
    if (sourceDoc.head() != null) {
        String headTitle = sourceDoc.head().attr(WxCrawlerConstant.BackupArticle.ARTICLE_TITLE);
        if (StringUtils.isNotEmpty(headTitle)) {
            return headTitle;
        }
    }

    // 2. First element matched by the configured title selector.
    if (sourceDoc.select(WxCrawlerConstant.HTMLElementSelector.TITLE).first() != null) {
        return sourceDoc.select(WxCrawlerConstant.HTMLElementSelector.TITLE).first().text();
    }

    // 3. Fall back to the document title.
    return sourceDoc.title();
}
 
Example 8
/**
 * Crawls the ss (ShadowSocks) accounts.
 *
 * <p>Fetches the document, builds an entity from the target URL, the document title and the
 * current date, and attaches the result of {@code parse(document)} to it. If an
 * {@link IOException} occurs, the error message is logged and an entity with an empty title
 * and a {@code false} flag is returned instead.
 *
 * @return the populated entity, or the fallback entity described above on I/O failure
 */
public ShadowSocksEntity getShadowSocks() {
	try {
		Document page = getDocument();
		ShadowSocksEntity result = new ShadowSocksEntity(getTargetURL(), page.title(), true, new Date());
		result.setShadowSocksSet(parse(page));
		return result;
	} catch (IOException e) {
		log.error(e.getMessage());
		return new ShadowSocksEntity(getTargetURL(), "", false, new Date());
	}
}
 
Example 9
/**
 * Requests an invalid containers endpoint expecting a 404 status, then verifies the title,
 * the first-child heading and the link text of the returned HTML page.
 */
@Test
public void whenMisSpeltContainersEndpoint_thenNotFoundResponseWithAPIMessage() throws Exception {
    HttpResponse notFoundResponse = getResponse(INVALID_CONTAINER_ENDPOINT, HttpStatus.SC_NOT_FOUND);
    Document page = Jsoup.parse(EntityUtils.toString(notFoundResponse.getEntity()));

    String pageTitle = page.title();
    assertNotNull(pageTitle);
    assertEquals("404 Not Found", pageTitle);
    assertEquals("404 Page Not Found", page.select("h1:first-child").text());
    assertEquals("Go to Dashboard", page.select("a").text());
}
 
Example 10
/**
 * Obtains a topic named after the document title, adds the "vcalendar" type to it and then
 * parses the document body into that topic.
 *
 * @param document parsed document holding the calendar data
 * @throws TopicMapException declared by this method; presumably raised by the topic operations
 */
private void parseCalendar(Document document) throws TopicMapException {
    String calendarName = document.title();
    Topic calendarType = getType("vcalendar");

    Topic calendarTopic = getOrCreateTopic(tm,null, calendarName);
    calendarTopic.addType(calendarType);

    parseCalendar(calendarTopic, document.body());
}
 
Example 11
/**
 * Collects every {@code a[href]} element of a page as an {@code Entry} made of the link's
 * absolute URL and a label.
 *
 * <p>The label is the link text, or the raw {@code href} attribute when the text is blank.
 *
 * @param webpage parsed document to scan for links
 * @return one entry per link, in document order; empty if the page has no links
 * @throws Exception kept for compatibility with existing callers
 */
public static List<Entry> getAllLinks(Document webpage) throws Exception {
    List<Entry> urls = new ArrayList<>();
    for (Element link : webpage.select("a[href]")) {
        final String label = defaultIfBlank(link.text(), link.attr("href"));
        urls.add(new Entry(link.absUrl("href"), label));
    }
    return urls;
}
 
Example 12
Source Project: Java-for-Data-Science   File: JSoupExamples.java    License: MIT License 5 votes vote down vote up
/**
 * Prints the document title followed by the text of the document's {@code body} element.
 *
 * @param document parsed document to display
 */
public void displayBodyText(Document document) {
    out.println("Title: " + document.title());

    out.println("---Body---");
    String bodyText = document.select("body").text();
    out.println("Text: " + bodyText);
}
 
Example 13
Source Project: crawler4j   File: ParserDemo.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Parses the fetched HTML and logs the thread name, the page title and the URL.
 * Any exception raised while doing so is printed and otherwise ignored.
 */
@Override
public void parse(HttpFetchResult result, String url, String threadName, boolean isUpdate) {
	try {
		Document parsedPage = Jsoup.parse(result.getHtml());
		logger.info(threadName +" " + parsedPage.title() + " " + url + " ");
	} catch (Exception e) {
		e.printStackTrace();
	}
}
 
Example 14
Source Project: superword   File: DynamicIp.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Posts an action to the router's status handler and verifies the outcome.
 *
 * <p>The request is sent with the given cookies. After the response is parsed the method
 * sleeps for 10 seconds, then reports success only if the response page title is
 * {@code "LAN | LAN Settings"} and the connectivity state matches the action:
 * action {@code "3"} expects {@link #isConnected()} to be true, action {@code "4"} expects
 * it to be false.
 *
 * @param cookies cookie names and values to send with the request
 * @param action  value of the {@code action} form field
 * @return {@code true} if the action was confirmed as described above, {@code false} otherwise
 */
public static boolean execute(Map<String, String> cookies, String action){
    String url = "http://192.168.0.1/goform/SysStatusHandle";
    Map<String, String> map = new HashMap<>();
    map.put("action", action);
    map.put("CMD", "WAN_CON");
    map.put("GO", "system_status.asp");
    Connection conn = Jsoup.connect(url)
            .header("Accept", ACCEPT)
            .header("Accept-Encoding", ENCODING)
            .header("Accept-Language", LANGUAGE)
            .header("Connection", CONNECTION)
            .header("Host", HOST)
            .header("Referer", REFERER)
            .header("User-Agent", USER_AGENT)
            .ignoreContentType(true)
            .timeout(30000);
    // Iterate entries directly instead of looking each key up again.
    for(Map.Entry<String, String> cookie : cookies.entrySet()){
        conn.cookie(cookie.getKey(), cookie.getValue());
    }

    String title = null;
    try {
        Connection.Response response = conn.method(Connection.Method.POST).data(map).execute();
        String html = response.body();
        Document doc = Jsoup.parse(html);
        title = doc.title();
        LOGGER.info("操作连接页面标题:"+title);
        Thread.sleep(10000);
    }catch (InterruptedException e){
        // Restore the interrupt flag so callers can still observe the interruption;
        // the broad catch below used to swallow it silently.
        Thread.currentThread().interrupt();
        LOGGER.error(e.getMessage());
    }catch (Exception e){
        LOGGER.error(e.getMessage());
    }
    if("LAN | LAN Settings".equals(title)){
        if(("3".equals(action) && isConnected())
                || ("4".equals(action) && !isConnected())){
            return true;
        }
    }
    return false;
}
 
Example 15
/**
 * Reads the "url" extra from the intent, fetches that page, stores its title in the
 * {@code title} field and broadcasts the field's value locally under {@code TITLE_FILTER}.
 *
 * <p>If the fetch fails with an {@link IOException}, the stack trace is printed and the
 * broadcast carries whatever value {@code title} already held.
 *
 * @param intent intent carrying the "url" string extra
 */
public void onHandleIntent(Intent intent) {
    this.url = intent.getStringExtra("url");
    try {
        // Connect to the web site and read the html document title in one go.
        title = Jsoup.connect(url).get().title();
    } catch (IOException e) {
        e.printStackTrace();
    }

    Intent titleBroadcast = new Intent(TITLE_FILTER);
    titleBroadcast.putExtra("title", title);
    LocalBroadcastManager.getInstance(this).sendBroadcast(titleBroadcast);
}
 
Example 16
/**
 * Fetches the page at {@code url} and stores its title in the {@code title} field.
 * On an {@link IOException} the stack trace is printed and {@code title} is left unchanged.
 *
 * @param params unused
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
    try {
        // Connect to the web site and read the html document title in one go.
        title = Jsoup.connect(url).get().title();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
 
Example 17
Source Project: danyuan-application   File: AutoGetHtml.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Sends a POST request to {@code http://www.oschina.net/} and returns the title of the
 * response document.
 *
 * <p>NOTE(review): neither {@code url} nor {@code key} is read by this method; the target
 * address and the query value are fixed. Confirm with callers whether that is intended.
 *
 * @param url not used by the current implementation
 * @param key not used by the current implementation
 * @return the title of the document returned by the request
 * @throws IOException if the request fails
 * @author Tenghui.Wang
 */
public static String getBody(String url, String key) throws IOException {
	return Jsoup.connect("http://www.oschina.net/")
	        // request parameter
	        .data("query", "Java")
	        // set the User-Agent
	        .userAgent("Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2")
	        // set a cookie
	        .cookie("auth", "token")
	        // set the timeout, in milliseconds
	        .timeout(3000)
	        // access the URL with the POST method
	        .post()
	        .title();
}
 
Example 18
Source Project: baleen   File: AbstractHtmlConsumer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Renders the CAS as an HTML file.
 *
 * <p>Builds an empty HTML document whose {@code lang} attribute comes from the document
 * annotation, fills the head with an optional stylesheet link, a UTF-8 charset declaration
 * and meta entries for the document annotation and every {@code Metadata} annotation, sets
 * the title from the metadata entry keyed {@code documentTitle} (case-insensitive; the last
 * such entry wins), lets {@code writeBody} populate the body and writes the result to the
 * file returned by {@code getFileName}.
 *
 * @param jCas the CAS to export
 * @throws AnalysisEngineProcessException if the file cannot be written
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
  final File f = getFileName(jCas);
  final DocumentAnnotation da = getDocumentAnnotation(jCas);

  final Document doc =
      Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>");
  doc.outputSettings(new Document.OutputSettings().prettyPrint(false));
  final Element head = doc.head();

  if (!Strings.isNullOrEmpty(css)) {
    final Element cssLink = head.appendElement("link");
    cssLink.attr("rel", "stylesheet");
    cssLink.attr("href", css);
  }

  final Element charset = head.appendElement("meta");
  charset.attr("charset", "utf-8");

  appendMeta(head, "document.type", da.getDocType());
  appendMeta(head, "document.sourceUri", da.getSourceUri());
  appendMeta(head, "externalId", da.getHash());

  appendMeta(head, "document.classification", da.getDocumentClassification());
  appendMeta(
      head,
      "document.caveats",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentCaveats())));
  appendMeta(
      head,
      "document.releasability",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentReleasability())));

  String title = null;
  for (final Metadata md : JCasUtil.select(jCas, Metadata.class)) {
    appendMeta(head, md.getKey(), md.getValue());
    if ("documentTitle".equalsIgnoreCase(md.getKey())) {
      title = md.getValue();
    }
  }

  if (!Strings.isNullOrEmpty(title)) {
    doc.title(title);
  }

  final Element body = doc.body();

  writeBody(jCas, body);

  try {
    // The head declares charset=utf-8, so the bytes on disk must be UTF-8 regardless of the
    // platform default; otherwise non-ASCII text is corrupted on non-UTF-8 systems.
    FileUtils.writeStringToFile(f, doc.html(), Charset.forName("UTF-8"));
  } catch (final IOException e) {
    throw new AnalysisEngineProcessException(e);
  }
}
 
Example 19
Source Project: SteamGifts   File: Utils.java    License: MIT License 4 votes vote down vote up
/**
 * Returns the document title without a trailing page marker.
 *
 * <p>On paginated URLs such as /giveaways/id/name/search?page=X the title has the form
 * "Game Title - Page X"; the " - Page X" suffix is removed.
 *
 * @param document parsed page
 * @return the title with any trailing " - Page X" suffix stripped
 */
public static String getPageTitle(Document document) {
    return document.title().replaceAll(" - Page ([\\d,]+)$", "");
}
 
Example 20
Source Project: ogham   File: HtmlUtils.java    License: Apache License 2.0 2 votes vote down vote up
/**
 * Extracts the title of an HTML document.
 *
 * @param htmlContent
 *            the HTML content that may contain a title
 * @return the document title, or {@code null} when the content has no
 *         {@code title} element directly under {@code head}
 */
public static String getTitle(String htmlContent) {
	Document parsed = Jsoup.parse(htmlContent);
	if (parsed.select("head > title").isEmpty()) {
		return null;
	}
	return parsed.title();
}