org.jsoup.safety.Whitelist Java Examples

The following examples show how to use org.jsoup.safety.Whitelist. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: WordCounterImpl.java    From cia with Apache License 2.0 6 votes vote down vote up
/**
 * Counts how often each term occurs in the content of a document.
 *
 * <p>The content is cleaned with jsoup's basic whitelist, added to a
 * {@code SimpleCorpus}, and the frequency of every term the corpus reports is
 * collected into a map.
 *
 * @param documentContentData holder whose {@code getContent()} value is analysed
 * @param maxResult not consulted by this implementation
 * @return map from term to the frequency reported by the corpus
 */
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {
		final String rawHtml = documentContentData.getContent();

		final SimpleCorpus corpus = new SimpleCorpus(
				SimpleSentenceSplitter.getInstance(),
				new SimpleTokenizer(),
				new SwedishStopWords(),
				EnglishPunctuations.getInstance());

		// Only the cleaned markup is handed to the corpus.
		final String cleanedHtml = Jsoup.clean(rawHtml, Whitelist.basic());
		corpus.add(new Text(cleanedHtml));

		final Map<String, Integer> frequencies = new HashMap<>();
		for (final Iterator<String> it = corpus.getTerms(); it.hasNext();) {
			final String term = it.next();
			frequencies.put(term, corpus.getTermFrequency(term));
		}
		return frequencies;
	}
 
Example #2
Source File: JsoupTest.java    From crawler-jsoup-maven with Apache License 2.0 6 votes vote down vote up
/**
 * Demo entry point: parses a small HTML string, manipulates its div/span with the
 * jsoup DOM API, prints the resulting document, and then prints the same markup
 * after cleaning it with the relaxed whitelist.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    
    String d = "<span><div>test</div></span>";
    Document doc = Jsoup.parse(d);
    Element div = doc.select("div").first(); // first <div> matched in the parsed document
    div.html("<p>lorem ipsum</p>"); // replace the div's inner HTML with a single paragraph
    div.prepend("<p>First</p>");
    div.append("<p>Last</p>");
    // the div should now hold three paragraphs: First, lorem ipsum, Last
    // NOTE(review): appendElement is given the whole HTML string `d`; jsoup presumably
    // expects a tag name here, so append(d) may have been intended — confirm.
    div.appendElement(d);
    Element span = doc.select("span").first(); // first <span> matched in the document
    span.wrap("<li><a href='http://example.com/'></a></li>");
    // the span is presumably now nested inside the <li><a> wrapper — verify against the output
    System.out.println(doc.html());
    
    // clean the serialized document: empty base URI, relaxed whitelist, pretty-printing off
    String s = Jsoup.clean(doc.html(), "", Whitelist.relaxed(), new OutputSettings().prettyPrint(false));
    
    System.out.println(s);
}
 
Example #3
Source File: TRECAquaintDocumentIndexer.java    From lucene4ir with Apache License 2.0 6 votes vote down vote up
public TRECAquaintDocumentIndexer(String indexPath, String tokenFilterFile, boolean positional){
    super(indexPath, tokenFilterFile, positional);

    try {
        whiteList = Whitelist.relaxed();
        whiteList.addTags("docno");
        whiteList.addTags("doc");
        whiteList.addTags("headline");
        whiteList.addTags("text");
        whiteList.addTags("date_time");
        whiteList.addTags("slug");
    } catch (Exception e){
        System.out.println(" caught a " + e.getClass() +
                "\n with message: " + e.getMessage());
    }

    doc = new Document();
    initFields();
    initAQUAINTDoc();
}
 
Example #4
Source File: JEditorPaneBrowser.java    From MtgDesktopCompanion with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Fetches the given URL inside a task handed to {@code ThreadManager} and shows a
 * cleaned copy of the returned HTML in the {@code browse} component.
 *
 * <p>The markup is cleaned with jsoup's basic whitelist extended by {@code img}
 * tags carrying a {@code src} attribute. Any exception is logged and shown as
 * text in the component instead.
 *
 * @param url address of the page to load
 */
@Override
public void loadURL(String url) {
	logger.debug("loading " + url);

	ThreadManager.getInstance().executeThread(()->{
		try {
			// Basic whitelist plus images with their source attribute.
			Whitelist allowed = Whitelist.basic()
					.addTags("img")
					.addAttributes("img", "src");

			String rawHtml = RequestBuilder.build().clean().url(url).method(METHOD.GET).setClient(client).toHtml().html();
			browse.setText(Jsoup.clean(rawHtml, allowed));
		}
		catch(Exception e)
		{
			logger.error(e);
			browse.setText("Error " + e);
		}
	}, "loading " + url);
}
 
Example #5
Source File: RouteNotFoundError.java    From flow with Apache License 2.0 6 votes vote down vote up
/**
 * Fills the error template for a route that could not be found and appends it to
 * this component's element.
 *
 * <p>The requested path and the optional custom message are stripped of all HTML
 * with {@code Whitelist.none()} before they are substituted into the template.
 *
 * @param event the navigation event providing the location, UI and routes
 * @param parameter the error parameter, optionally carrying a custom message
 * @return {@code HttpServletResponse.SC_NOT_FOUND}
 */
@Override
public int setErrorParameter(BeforeEnterEvent event,
        ErrorParameter<NotFoundException> parameter) {
    final String requestedPath = event.getLocation().getPath();
    final String reason = parameter.hasCustomMessage()
            ? "Reason: " + parameter.getCustomMessage()
            : "";

    // Strip all markup from caller-influenced values before templating.
    final String safePath = Jsoup.clean(requestedPath, Whitelist.none());
    final String safeReason = Jsoup.clean(reason, Whitelist.none());

    final boolean productionMode = event.getUI().getSession().getConfiguration()
            .isProductionMode();

    String page = getErrorHtml(productionMode);
    page = page.replace("{{path}}", safePath)
            .replace("{{additionalInfo}}", safeReason);
    if (page.contains("{{routes}}")) {
        page = page.replace("{{routes}}", getRoutes(event));
    }

    getElement().appendChild(new Html(page).getElement());
    return HttpServletResponse.SC_NOT_FOUND;
}
 
Example #6
Source File: TopicApiController.java    From pybbs with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Updates the title and content of an existing topic owned by the API user.
 *
 * @param id identifier of the topic to edit
 * @param body request payload; the {@code title} and {@code content} entries are read
 * @return a success result wrapping the updated topic, with sensitive words in
 *         its content replaced by {@code *}
 */
@PutMapping(value = "/{id}")
public Result edit(@PathVariable Integer id, @RequestBody Map<String, String> body) {
    final User currentUser = getApiUser();
    final String newTitle = body.get("title");
    final String newContent = body.get("content");
    ApiAssert.notEmpty(newTitle, "请输入标题");

    // Update the topic; only its owner may do so.
    final Topic topic = topicService.selectById(id);
    final boolean ownedByCaller = topic.getUserId().equals(currentUser.getId());
    ApiAssert.isTrue(ownedByCaller, "谁给你的权限修改别人的话题的?");

    // The title is cleaned with a whitelist that only lets <video> through.
    final Whitelist titleWhitelist = Whitelist.none().addTags("video");
    topic.setTitle(Jsoup.clean(newTitle, titleWhitelist));
    // NOTE(review): the content is stored as received; only the title passes through Jsoup.clean here.
    topic.setContent(newContent);
    topic.setModifyTime(new Date());
    topicService.update(topic, null);

    // Mask sensitive words in the copy returned to the caller.
    final String masked = SensitiveWordUtil.replaceSensitiveWord(topic.getContent(), "*", SensitiveWordUtil.MinMatchType);
    topic.setContent(masked);
    return success(topic);
}
 
Example #7
Source File: TopicService.java    From pybbs with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Persists the topic, refreshes its tag associations when a tag string is
 * supplied, and re-indexes the topic.
 *
 * @param topic the topic to update
 * @param tags tag string for the topic; tag handling is skipped when it is empty
 */
@Override
public void update(Topic topic, String tags) {
    topicMapper.updateById(topic);

    // Handle tags only when a non-empty tag string was supplied.
    final boolean hasTags = !StringUtils.isEmpty(tags);
    if (hasTags) {
        // Decrement topicCount on each of the topic's previous tags.
        tagService.reduceTopicCount(topic.getId());
        // Save the tags after stripping all HTML from the tag string.
        final String plainTags = Jsoup.clean(tags, Whitelist.none());
        final List<Tag> savedTags = tagService.insertTag(plainTags);
        // Link the saved tags to the topic.
        topicTagService.insertTopicTag(topic.getId(), savedTags);
    }

    // Index the topic.
    indexedService.indexTopic(String.valueOf(topic.getId()), topic.getTitle(), topic.getContent());
}
 
Example #8
Source File: SanitizeActivity.java    From android-opensource-library-56 with Apache License 2.0 6 votes vote down vote up
/**
 * Sets up the sanitize screen: pre-fills the input field with a sample link that
 * carries an {@code onclick} handler and wires the button so that a click writes
 * the jsoup-cleaned input (basic whitelist) into the output field.
 *
 * @param savedInstanceState passed through to the superclass
 */
@Override
protected void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    setContentView(R.layout.activity_sanitize);

    final EditText source = (EditText) findViewById(R.id.input_text);
    source.setText("<p><a href='http://example.com/' onclick='doAttack()'>Link</a></p>");
    final EditText target = (EditText) findViewById(R.id.sanitized_text);

    // On click: clean the current input and display the result.
    final OnClickListener sanitizeOnClick = new OnClickListener() {
        @Override
        public void onClick(View v) {
            final String dirty = source.getText().toString();
            target.setText(Jsoup.clean(dirty, Whitelist.basic()));
        }
    };
    findViewById(R.id.sanitize_button).setOnClickListener(sanitizeOnClick);
}
 
Example #9
Source File: XHTMLDocumentHandler.java    From docx4j-template with Apache License 2.0 6 votes vote down vote up
/**
 * Demo entry point: cleans one sample anchor with each of jsoup's predefined
 * whitelists (none, simpleText, basic, basicWithImages, relaxed) and prints the
 * results, separated by a line of asterisks.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
	final String baseUri = "http://www.baidu.com";
	final String html = "<a href=\"http://www.baidu.com/gaoji/preferences.html\"name=\"tj_setting\">搜索设置</a>";

	final Whitelist[] whitelists = {
			Whitelist.none(),
			Whitelist.simpleText(),
			Whitelist.basic(),
			Whitelist.basicWithImages(),
			Whitelist.relaxed()
	};

	for (int i = 0; i < whitelists.length; i++) {
		// Separator goes between results, not before the first one.
		if (i > 0) {
			System.out.println("*******");
		}
		System.out.println(Jsoup.clean(html, baseUri, whitelists[i]));
	}
}
 
Example #10
Source File: XssHttpServletRequestWrapper.java    From supplierShop with MIT License 6 votes vote down vote up
/**
 * Returns the values of a request parameter after cleaning each one with jsoup's
 * relaxed whitelist and trimming it.
 *
 * @param name the parameter name
 * @return the cleaned values, or whatever the wrapped request returns when it
 *         has no values for {@code name}
 */
@Override
public String[] getParameterValues(String name)
{
    final String[] rawValues = super.getParameterValues(name);
    if (rawValues == null)
    {
        return super.getParameterValues(name);
    }

    final String[] cleanedValues = new String[rawValues.length];
    int position = 0;
    for (String rawValue : rawValues)
    {
        // Guard against XSS and strip leading/trailing whitespace.
        cleanedValues[position++] = Jsoup.clean(rawValue, Whitelist.relaxed()).trim();
    }
    return cleanedValues;
}
 
Example #11
Source File: XssHttpServletRequestWrapper.java    From ruoyiplus with MIT License 6 votes vote down vote up
/**
 * Looks up the values of a request parameter and returns them cleaned with
 * jsoup's relaxed whitelist and trimmed.
 *
 * @param name the parameter name
 * @return the cleaned values; when the wrapped request yields {@code null} the
 *         result of asking it again is returned unchanged
 */
@Override
public String[] getParameterValues(String name)
{
    final String[] original = super.getParameterValues(name);
    if (original == null)
    {
        return super.getParameterValues(name);
    }

    final int count = original.length;
    final String[] sanitized = new String[count];
    for (int idx = 0; idx < count; idx++)
    {
        // Guard against XSS, then drop surrounding whitespace.
        final String cleaned = Jsoup.clean(original[idx], Whitelist.relaxed());
        sanitized[idx] = cleaned.trim();
    }
    return sanitized;
}
 
Example #12
Source File: TextFilterManage.java    From bbs with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Removes every tag and returns only the text.
 *
 * @param html the markup to filter
 * @return the text with all HTML removed, or an empty string when {@code html} is blank
 */
public String filterText(String html) {
	if(StringUtils.isBlank(html)) {
		return "";
	}

	// Author's note: both doc.text() and Jsoup.clean can extract the text; text() turns
	// tags such as <p> into spaces rather than line breaks, while clean turns them into
	// line breaks by default.
	// A variant that skips re-formatting would be
	// Jsoup.clean(html, "", Whitelist.none(), new OutputSettings().prettyPrint(false)).

	// Keep the text only; all other HTML content is removed.
	final String textOnly = Jsoup.clean(html, Whitelist.none());
	return textOnly;
}
 
Example #13
Source File: TextFilterManage.java    From bbs with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Filters the tags of rich-text markup.
 *
 * <p>The markup is cleaned against the whitelist returned by
 * {@code filterParameter(null)}, using the URL derived from the request as base
 * URI and with pretty-printing (re-formatting) switched off.
 *
 * @param request the current request, used to obtain the base URI
 * @param html the rich-text markup to filter
 * @return the cleaned markup, or an empty string when {@code html} is blank
 */
public String filterTag(HttpServletRequest request,String html) {
	if(StringUtils.isBlank(html)) {
		return "";
	}

	final Whitelist allowed = this.filterParameter(null);
	final String baseUri = Configuration.getUrl(request);
	// prettyPrint(false): do not re-format the output.
	final OutputSettings unformatted = new OutputSettings().prettyPrint(false);

	return Jsoup.clean(html, baseUri, allowed, unformatted);
}
 
Example #14
Source File: XssHttpServletRequestWrapper.java    From RuoYi with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the parameter values for {@code name}, each cleaned with jsoup's
 * relaxed whitelist and trimmed.
 *
 * @param name the parameter name
 * @return the cleaned values, or the wrapped request's own answer when it
 *         reported no values
 */
@Override
public String[] getParameterValues(String name) {
    final String[] incoming = super.getParameterValues(name);
    if (incoming == null) {
        return super.getParameterValues(name);
    }

    final String[] escaped = new String[incoming.length];
    int next = 0;
    while (next < incoming.length) {
        // Protect against XSS and remove leading/trailing spaces.
        escaped[next] = Jsoup.clean(incoming[next], Whitelist.relaxed()).trim();
        next++;
    }
    return escaped;
}
 
Example #15
Source File: TextFilterManage.java    From bbs with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Filters the tags, drops everything handled by {@code deleteHiddenTag}
 * (the {@code <hide>} content, per the original description), and returns only
 * the text.
 *
 * @param html the markup to filter
 * @return the remaining text, or an empty string when the input or the
 *         intermediate result is blank
 */
public String filterHideText(String html) {
	if(StringUtils.isBlank(html)) {
		return "";
	}

	final String withoutHidden = this.deleteHiddenTag(html);
	if(StringUtils.isBlank(withoutHidden)) {
		return "";
	}

	// Keep the text only; all other HTML content is removed.
	// A variant that skips re-formatting would pass "" as base URI and
	// new OutputSettings().prettyPrint(false) to Jsoup.clean.
	final String textOnly = Jsoup.clean(withoutHidden, Whitelist.none());
	return textOnly;
}
 
Example #16
Source File: Utilities.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Cleans a highlight fragment so that only {@code em} tags survive and renames
 * those to {@code mark}.
 *
 * @param aHighlight the HTML fragment to clean
 * @return the inner HTML of the cleaned document's body
 */
public static String cleanHighlight(String aHighlight) {
    // Whitelist that permits nothing but <em>.
    final Whitelist emOnly = new Whitelist().addTags("em");

    final Document parsed = Jsoup.parseBodyFragment(aHighlight, "");
    final Document sanitized = new Cleaner(emOnly).clean(parsed);

    // Turn every remaining <em> into <mark>.
    sanitized.select("em").tagName("mark");

    return sanitized.body().html();
}
 
Example #17
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * A self-closed void tag ({@code <br/>}) must not be reported as a parse error,
 * must validate against the basic whitelist, and must be cleaned to {@code <br>}.
 */
@Test public void selfClosingVoidIsNotAnError() {
    final String markup = "<p>test<br/>test<br/></p>";

    final Parser trackingParser = Parser.htmlParser().setTrackErrors(5);
    trackingParser.parseInput(markup, "");
    assertEquals(0, trackingParser.getErrors().size());

    assertTrue(Jsoup.isValid(markup, Whitelist.basic()));
    assertEquals("<p>test<br>test<br></p>", Jsoup.clean(markup, Whitelist.basic()));
}
 
Example #18
Source File: HtmlParser.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Applies optional single-tag adjustments to the class-level {@code whitelist}
 * and returns it.
 *
 * @param extraTagToRemoveWhitelist tag to remove; skipped when null or empty
 * @param extraTagToAddWhiteList tag to add; skipped when null or empty
 * @return the whitelist reference after the adjustments
 */
private static Whitelist whitelist(String extraTagToRemoveWhitelist, String extraTagToAddWhiteList)
{	
	// NOTE(review): this copies only the reference, so the calls below act on the shared
	// `whitelist` object itself — presumably the changes persist across calls; confirm intended.
	Whitelist newWhiteList = whitelist;
	if(extraTagToRemoveWhitelist!=null && !extraTagToRemoveWhitelist.isEmpty())
		newWhiteList.removeTags(extraTagToRemoveWhitelist);
	if(extraTagToAddWhiteList!=null && !extraTagToAddWhiteList.isEmpty())
		newWhiteList.addTags(extraTagToAddWhiteList);
	return newWhiteList;
}
 
Example #19
Source File: HtmlParser.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Applies optional tag adjustments to the class-level {@code whitelist} and
 * returns it.
 *
 * <p>Removals are applied first, then additions, so a tag listed in both arrays
 * ends up allowed. The result is returned directly, matching the
 * single-tag overload; the previous implementation called {@code removeTags}
 * a second time on return, which bypassed the null/empty guard and undid any
 * addition of a tag that was also listed for removal.
 *
 * @param extraTagsToRemoveWhitelist tags to remove; skipped when null or empty
 * @param extraTagsToAddWhiteList tags to add; skipped when null or empty
 * @return the whitelist reference after the adjustments
 */
private static Whitelist whitelist(String[] extraTagsToRemoveWhitelist, String[] extraTagsToAddWhiteList)
{
	// NOTE(review): this copies only the reference, so the adjustments act on the shared
	// `whitelist` object itself — presumably they persist across calls; confirm intended.
	Whitelist newWhiteList = whitelist;
	if(extraTagsToRemoveWhitelist!=null && extraTagsToRemoveWhitelist.length>0)
		newWhiteList.removeTags(extraTagsToRemoveWhitelist);
	if(extraTagsToAddWhiteList!=null && extraTagsToAddWhiteList.length>0)
		newWhiteList.addTags(extraTagsToAddWhiteList);
	return newWhiteList;
}
 
Example #20
Source File: HtmlParser.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Cleans the input against the given whitelist, parses the result, and collects
 * the text gathered by {@code readNodes} from the body's child nodes.
 *
 * @param input the HTML to process
 * @param wl the whitelist used for cleaning
 * @return the list filled by {@code readNodes}
 */
private static List<String> parse(String input, Whitelist wl)
{
	final String sanitized = Jsoup.clean(input, "", wl, outputSettings);
	final Document parsed = Jsoup.parse(sanitized);

	// Apply the shared output settings to the re-parsed document as well.
	parsed.outputSettings(outputSettings);

	final List<String> collected = new ArrayList<String>();
	readNodes(parsed.body().childNodes(), collected);
	return collected;
}
 
Example #21
Source File: HtmlParser.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Cleans the input against the given whitelist, parses the result, and collects
 * the entries gathered by {@code readNodesWithTags} from the body's child nodes,
 * starting with {@code "body"} as the tag argument.
 *
 * @param input the HTML to process
 * @param wl the whitelist used for cleaning
 * @return the list of entries filled by {@code readNodesWithTags}
 */
private static List<Map.Entry<String,String>> parseWithTags(String input, Whitelist wl)
{
	final String sanitized = Jsoup.clean(input, "", wl, outputSettings);
	final Document parsed = Jsoup.parse(sanitized);

	// Apply the shared output settings to the re-parsed document as well.
	parsed.outputSettings(outputSettings);

	final List<Map.Entry<String,String>> entries = new ArrayList<Map.Entry<String,String>>();
	readNodesWithTags(parsed.body().childNodes(), entries, "body");
	return entries;
}
 
Example #22
Source File: ElasticsearchDocumentWriter.java    From newsleak with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Converts HTML to plain text while keeping line breaks, and turns the
 * {@code &gt;} and {@code &lt;} entities back into characters.
 *
 * <p>A marker is placed after every {@code br} and before every {@code p}, the
 * markers become real newlines, and the result is cleaned with
 * {@code Whitelist.none()} without pretty-printing.
 *
 * @param html
 *            the html, may be {@code null}
 * @return the converted string, or {@code null} when {@code html} is {@code null}
 */
public static String replaceHtmlLineBreaks(String html) {
	if (html == null) {
		return null;
	}

	final Document parsed = Jsoup.parse(html);
	// Without pretty-printing, html() preserves line breaks and spacing.
	parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

	// Insert literal backslash-n markers, then swap them for real newlines.
	parsed.select("br").append("\\n");
	parsed.select("p").prepend("\\n\\n");
	final String withNewlines = parsed.html().replaceAll("\\\\n", "\n");

	final Document.OutputSettings unformatted = new Document.OutputSettings().prettyPrint(false);
	final String textOnly = Jsoup.clean(withNewlines, "", Whitelist.none(), unformatted);

	return textOnly
			.replaceAll("&gt;", ">")
			.replaceAll("&lt;", "<");
}
 
Example #23
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * A self-closed non-void tag ({@code <div />}) must be reported as exactly one
 * parse error, must fail validation against the relaxed whitelist, and must be
 * cleaned into an explicit open/close pair.
 */
@Test public void selfClosingOnNonvoidIsError() {
    final String markup = "<p>test</p><div /><div>Two</div>";

    final Parser trackingParser = Parser.htmlParser().setTrackErrors(5);
    trackingParser.parseInput(markup, "");
    assertEquals(1, trackingParser.getErrors().size());
    final String firstError = trackingParser.getErrors().get(0).toString();
    assertEquals("18: Tag cannot be self closing; not a void tag", firstError);

    assertFalse(Jsoup.isValid(markup, Whitelist.relaxed()));
    final String cleaned = Jsoup.clean(markup, Whitelist.relaxed());
    assertEquals("<p>test</p> <div></div> <div> Two </div>", StringUtil.normaliseWhitespace(cleaned));
}
 
Example #24
Source File: XSSTestCase.java    From symphonyx with Apache License 2.0 5 votes vote down vote up
/**
 * An image whose {@code src} value tries to break out of the attribute and add an
 * {@code onerror} handler must be rejected by the basicWithImages whitelist,
 * while a plain image URL must be accepted.
 */
@Test
public void xss() {
    final String maliciousSrc = "http://error\"  onerror=\"this.src='http://7u2fje.com1.z0.glb.clouddn.com/girl.jpg';this.removeAttribute('onerror');if(!window.a){console.log('Where am I ?');window.a=1}";
    final String maliciousImg = "<img src=\"" + maliciousSrc + "\"/>";
    assertFalse(Jsoup.isValid(maliciousImg, Whitelist.basicWithImages()));

    final String plainSrc = "http://7u2fje.com1.z0.glb.clouddn.com/girl.jpg";
    final String plainImg = "<img src=\"" + plainSrc + "\"/>";
    assertTrue(Jsoup.isValid(plainImg, Whitelist.basicWithImages()));
}
 
Example #25
Source File: Marked.java    From markedj with Apache License 2.0 5 votes vote down vote up
/**
 * Lexes and parses the source into HTML and, when the options carry a whitelist,
 * cleans that HTML with it.
 *
 * @param src the source text to convert
 * @param options options for the lexer and parser; also supplies the optional whitelist
 * @param renderer renderer handed to the parser
 * @return the generated HTML, cleaned when {@code options.getWhitelist()} is non-null
 */
public static String marked(String src, Options options, Renderer renderer){
    final Lexer.LexerResult lexed = new Lexer(options).lex(src);
    final String html = new Parser(options, renderer)
            .parse(lexed.getTokens(), lexed.getLinks());

    final Whitelist whitelist = options.getWhitelist();
    // No whitelist configured: hand back the HTML untouched.
    return whitelist == null ? html : Jsoup.clean(html, whitelist);
}
 
Example #26
Source File: PostService.java    From expper with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Creates a post from the DTO, filling title, content and domain from the JSON
 * returned by {@code getWebPost} for the post's URL.
 *
 * <p>The content is cleaned with jsoup's relaxed whitelist and the title is cut
 * to at most 255 characters before the post is tagged and saved.
 *
 * @param postDTO the data the post is mapped from
 * @return the saved post, or an empty {@code Optional} when no web result was obtained
 * @throws JSONException when the web result cannot be read as the expected JSON
 */
@Transactional
@Timed
@Caching(evict = {
    @CacheEvict(value = TagService.CACHE_COUNT_USER, key = "#postDTO.userId.toString().concat('_posts_count')"),
    @CacheEvict(value = CACHE_COUNT_USER_TAG_POSTS, key = "#postDTO.userId.toString().concat('_tags_posts_count')", allEntries = true),
})
public Optional<Post> createPost(PostDTO postDTO) throws JSONException {
    final Post post = postMapper.postDTOToPost(postDTO);
    final String response = getWebPost(String.format(SERVER_URL, post.getUrl()));
    if (response == null) {
        return Optional.empty();
    }

    final JSONObject json = new JSONObject(response);

    // Filter html tags
    final String content = Jsoup.clean(json.getString("content"), Whitelist.relaxed());

    // Store the title, then cut it down to at most 255 characters.
    post.setTitle(json.getString("title"));
    final String fullTitle = post.getTitle();
    post.setTitle(fullTitle.substring(0, Math.min(255, fullTitle.length())));
    post.setContent(content);
    post.setDomain(json.getString("host"));

    updateTags(post, null);
    saveNewPost(post);

    return Optional.of(post);
}
 
Example #27
Source File: RabbitConsumer.java    From expper with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Fetches the full text of a post and stores it on the persisted post.
 *
 * <p>The content is taken from the JSON returned by {@code getWebPost}, cleaned
 * with jsoup's relaxed whitelist, and saved. Missing results, a missing post and
 * any exception are logged; nothing is thrown to the caller.
 *
 * @param post the post whose id and URL identify what to fetch
 */
public void getArticle(Post post) {
    log.debug("Handle crawling article full text from source site, id=" + post.getId() + " , url=" + post.getUrl());

    try {
        final String response = postService.getWebPost(String.format(PostService.SERVER_URL, post.getUrl()));
        if (response == null) {
            log.error("Failed to get article full text, id=" + post.getId());
            return;
        }

        // Re-read the post: it may no longer exist by the time the result arrives.
        final Post stored = postRepository.findOne(post.getId());
        if (stored == null) {
            log.warn("Cancel crawling article full text of post id=" + post.getId() + ", because the post does not exist.");
            return;
        }

        final String rawContent = new JSONObject(response).getString("content");
        stored.setContent(Jsoup.clean(rawContent, Whitelist.relaxed()));

        postService.saveNewPost(stored);
    } catch (Exception e) {
        log.error("Failed to resolve article full text, id=" + post.getId() + ", url=" + post.getUrl() + ", exception: " + e.getMessage());
    }
}
 
Example #28
Source File: HtmlTextFilter.java    From voj with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Filters a string that contains HTML.
 *
 * <p>A marker is placed after every {@code br} and before every {@code p}, the
 * markers become real newlines, and the result is cleaned with
 * {@code Whitelist.none()} without pretty-printing.
 *
 * @param text - the string to filter
 * @return the filtered string, or {@code null} when {@code text} is {@code null}
 */
public static String filter(String text) {
	if ( text == null ) {
		return null;
	}

	final Document parsed = Jsoup.parse(text);
	parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

	// Insert literal backslash-n markers, then swap them for real newlines.
	parsed.select("br").append("\\n");
	parsed.select("p").prepend("\\n\\n");
	final String withNewlines = parsed.html().replaceAll("\\\\n", "\n");

	final Document.OutputSettings unformatted = new Document.OutputSettings().prettyPrint(false);
	return Jsoup.clean(withNewlines, "", Whitelist.none(), unformatted);
}
 
Example #29
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks that {@code <br/>} produces no tracked parse error, passes validation
 * against the basic whitelist, and is written back as {@code <br>} by the cleaner.
 */
@Test public void selfClosingVoidIsNotAnError() {
    final String input = "<p>test<br/>test<br/></p>";
    final String expectedClean = "<p>test<br>test<br></p>";

    final Parser parser = Parser.htmlParser().setTrackErrors(5);
    parser.parseInput(input, "");
    final int errorCount = parser.getErrors().size();
    assertEquals(0, errorCount);

    final boolean valid = Jsoup.isValid(input, Whitelist.basic());
    assertTrue(valid);
    final String actualClean = Jsoup.clean(input, Whitelist.basic());
    assertEquals(expectedClean, actualClean);
}
 
Example #30
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks that {@code <div />} yields one tracked parse error with the expected
 * message, fails validation against the relaxed whitelist, and is cleaned into
 * {@code <div></div>}.
 */
@Test public void selfClosingOnNonvoidIsError() {
    final String input = "<p>test</p><div /><div>Two</div>";
    final String expectedError = "18: Tag cannot be self closing; not a void tag";
    final String expectedClean = "<p>test</p> <div></div> <div> Two </div>";

    final Parser parser = Parser.htmlParser().setTrackErrors(5);
    parser.parseInput(input, "");
    assertEquals(1, parser.getErrors().size());
    assertEquals(expectedError, parser.getErrors().get(0).toString());

    final boolean valid = Jsoup.isValid(input, Whitelist.relaxed());
    assertFalse(valid);
    final String actualClean = Jsoup.clean(input, Whitelist.relaxed());
    assertEquals(expectedClean, StringUtil.normaliseWhitespace(actualClean));
}