org.jsoup.nodes.Entities Java Examples

The following examples show how to use org.jsoup.nodes.Entities. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CleanerTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
@Test public void supplyOutputSettings() {
    // Caller-supplied output settings must be honoured by Jsoup.clean instead of the defaults.
    final String baseUri = "http://foo.com/";
    final String html = "<div><p>&bernou;</p></div>";

    Document.OutputSettings settings = new Document.OutputSettings()
        .prettyPrint(false)
        .escapeMode(Entities.EscapeMode.extended)
        .charset("ascii");

    String customOut = Jsoup.clean(html, baseUri, Whitelist.relaxed(), settings);
    String defaultOut = Jsoup.clean(html, baseUri, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // Custom settings: no pretty printing, the character is written as a named entity.
    assertEquals("<div><p>&bernou;</p></div>", customOut);
    // Default settings: pretty printed, the character is written literally.
    assertEquals("<div>\n <p>ℬ</p>\n</div>", defaultOut);

    // With the base escape mode the expected output is a numeric reference.
    settings.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, baseUri, Whitelist.relaxed(), settings);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
 
Example #2
Source File: CleanerTest.java    From jsoup-learning with MIT License 6 votes vote down vote up
@Test public void supplyOutputSettings() {
    // Output settings handed to Jsoup.clean should replace the default document output settings.
    String source = "<div><p>&bernou;</p></div>";
    String base = "http://foo.com/";

    Document.OutputSettings custom = new Document.OutputSettings();
    custom.escapeMode(Entities.EscapeMode.extended);
    custom.prettyPrint(false);

    String customOut = Jsoup.clean(source, base, Whitelist.relaxed(), custom);
    String defaultOut = Jsoup.clean(source, base, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    String expectedCustom = "<div><p>&bernou;</p></div>";
    String expectedDefault = "<div>\n <p>ℬ</p>\n</div>";
    assertEquals(expectedCustom, customOut);
    assertEquals(expectedDefault, defaultOut);

    // Restrict the charset to ASCII and fall back to the base escape mode:
    // the expected output is then a numeric character reference.
    custom.charset("ASCII");
    custom.escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(source, base, Whitelist.relaxed(), custom);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
 
Example #3
Source File: CleanerTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
@Test public void supplyOutputSettings() {
    // Jsoup.clean must use the output settings it is given rather than the defaults.
    final String baseUri = "http://foo.com/";
    final String html = "<div><p>&bernou;</p></div>";

    Document.OutputSettings settings = new Document.OutputSettings()
        .prettyPrint(false)
        .escapeMode(Entities.EscapeMode.extended)
        .charset("ascii");

    String customOut = Jsoup.clean(html, baseUri, Whitelist.relaxed(), settings);
    String defaultOut = Jsoup.clean(html, baseUri, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // Entities now prefer the shorter name when several aliases exist.
    assertEquals("<div><p>&Bscr;</p></div>", customOut);
    assertEquals("<div>\n <p>ℬ</p>\n</div>", defaultOut);

    // Base escape mode: the expected output is a numeric reference.
    settings.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, baseUri, Whitelist.relaxed(), settings);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
 
Example #4
Source File: CleanerTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
@Test public void supplyOutputSettings() {
    // Checks that the default document output settings can be overridden when cleaning.
    String input = "<div><p>&bernou;</p></div>";
    String site = "http://foo.com/";

    Document.OutputSettings overrides = new Document.OutputSettings();
    overrides.charset("ascii");
    overrides.escapeMode(Entities.EscapeMode.extended);
    overrides.prettyPrint(false);

    String customOut = Jsoup.clean(input, site, Whitelist.relaxed(), overrides);
    String defaultOut = Jsoup.clean(input, site, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // The shorter alias of the entity name is expected in the output.
    String expectedCustom = "<div><p>&Bscr;</p></div>";
    assertEquals(expectedCustom, customOut);
    String expectedDefault = "<div>\n <p>ℬ</p>\n</div>";
    assertEquals(expectedDefault, defaultOut);

    // Switch to the base escape mode; a numeric reference is expected instead of a name.
    overrides.charset("ASCII");
    overrides.escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(input, site, Whitelist.relaxed(), overrides);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
 
Example #5
Source File: Html.java    From zongtui-webcrawler with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Turns off jsoup's HTML entity escaping by emptying the entity maps of the
 * {@code base} and {@code extended} escape modes. This is a hack intended only for jsoup 1.7.2.
 * <p>
 * Does nothing unless {@code DISABLE_HTML_ENTITY_ESCAPE} is set, and does nothing once
 * {@code INITED} has been set by an earlier call.
 */
private void disableJsoupHtmlEntityEscape() {
	if (!DISABLE_HTML_ENTITY_ESCAPE || INITED) {
		return;
	}
	Entities.EscapeMode.base.getMap().clear();
	Entities.EscapeMode.extended.getMap().clear();
	// remember that the maps were already cleared
	INITED = true;
}
 
Example #6
Source File: HerdStringUtils.java    From herd with Apache License 2.0 5 votes vote down vote up
/**
 * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist.
 * <p>
 * The fragment is HTML-unescaped first, parsed as a body fragment, and then cleaned so that only the whitelisted tags (each with an optional
 * {@code class} attribute) survive. The result is serialized with the base escape mode, UTF-8 and without pretty printing.
 *
 * @param fragment the specified String
 * @param whitelistTags the specified whitelist tags; every non-alphabetic character (e.g. the angle brackets of {@code <b>}) is removed to obtain the
 * tag name
 *
 * @return cleaned String with allowed tags
 */
public static String stripHtml(String fragment, String... whitelistTags)
{
    // Unescape HTML.
    String unEscapedFragment = StringEscapeUtils.unescapeHtml4(fragment);

    // Parse out html tags except those from a given list of whitelist tags
    Document dirty = Jsoup.parseBodyFragment(unEscapedFragment);

    Whitelist whitelist = new Whitelist();

    for (String whitelistTag : whitelistTags)
    {
        // Get the actual tag name from the whitelist tag
        // this is vulnerable in general to complex tags but will suffice for our simple needs.
        // The Unicode property escape needs the leading "\p": without it the character class is just the literal
        // characters of "{IsAlphabetic}" and a tag such as "div" would be reduced to "i".
        String tagName = StringUtils.removePattern(whitelistTag, "[^\\p{IsAlphabetic}]");

        // Add all specified tags to the whitelist while preserving the class attribute
        whitelist.addTags(tagName).addAttributes(tagName, "class");
    }

    Cleaner cleaner = new Cleaner(whitelist);
    Document clean = cleaner.clean(dirty);
    // Set character encoding to UTF-8 and make sure no line-breaks are added
    clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8).prettyPrint(false);

    // return 'cleaned' html body
    return clean.body().html();
}
 
Example #7
Source File: Rgaa3Extractor.java    From Asqatasun with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Rewrites the HTML testcase files found below {@code RGAA3_TESTCASE_PATH}.
 * <p>
 * Each sub-directory name, once the {@code Rgaa30Rule} prefix and {@code .java} suffix are removed, is read as
 * three two-character numbers (theme, criterion, test). For every regular file in it whose name contains
 * {@code html}, the {@code .test-detail} element is emptied, turned into a {@code div}, given a {@code lang}
 * attribute if it has none, and filled with the raw rule HTML looked up in {@code RGAA3}. The zero-padded
 * dotted key (e.g. {@code 01.02.03}) is then replaced in the output, which is written back to the same file.
 * Documents without a {@code .test-detail} element are only printed to standard output.
 *
 * @throws IOException if the root directory cannot be listed, or a testcase cannot be read or written
 */
private static void createTestcaseFiles() throws IOException {
    File srcDir = new File(RGAA3_TESTCASE_PATH);
    File[] ruleDirs = srcDir.listFiles();
    if (ruleDirs == null) {
        // listFiles() returns null when the path is not a directory or cannot be read
        throw new IOException("Cannot list testcase directory: " + srcDir);
    }
    for (File ruleDir : ruleDirs) {
        File[] testcases = ruleDir.listFiles();
        if (testcases == null) {
            // a plain file (or unreadable directory) holds no testcases; skip it instead of failing with an NPE
            continue;
        }
        String fileName = ruleDir.getName().replace("Rgaa30Rule", "").replace(".java", "");
        String theme = fileName.substring(0, 2);
        String crit = fileName.substring(2, 4);
        String test = fileName.substring(4, 6);
        // key without leading zeros, e.g. "1-2-3"
        String testKey = Integer.parseInt(theme) + "-" + Integer.parseInt(crit) + "-" + Integer.parseInt(test);
        // zero-padded dotted key, e.g. "01.02.03", that has to be replaced in the output
        String wrongKey = theme + "." + crit + "." + test;
        for (File testcase : testcases) {
            if (!testcase.isFile() || !testcase.getName().contains("html")) {
                continue;
            }
            Document doc = Jsoup.parse(FileUtils.readFileToString(testcase));
            Element detail = doc.select(".test-detail").first();
            if (detail == null) {
                System.out.println(doc.outerHtml());
                continue;
            }
            detail.tagName("div");
            // clear the text and detach any remaining child elements before inserting the rule
            detail.text("");
            for (Element el : detail.children()) {
                el.remove();
            }
            if (!detail.hasAttr("lang")) {
                detail.attr("lang", "fr");
            }
            detail.append("\n" + RGAA3.get(testKey).ruleRawHtml + "\n");
            doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
            doc.outputSettings().outline(false);
            doc.outputSettings().indentAmount(4);
            String outputHtml = doc.outerHtml();
            if (outputHtml.contains(wrongKey)) {
                // literal replacement: the dots in wrongKey must not act as regex wildcards, and
                // '$' or '\' in the replacement must not be read as group references
                outputHtml = outputHtml.replace(wrongKey, RGAA3.get(testKey).getRuleDot());
            }
            FileUtils.writeStringToFile(testcase, outputHtml);
        }
    }
}
 
Example #8
Source File: HTMLJsoupCleanerImpl.java    From Asqatasun with GNU Affero General Public License v3.0 5 votes vote down vote up
@Override
 public void run() {
     // Sanitise the namespace declarations, then parse the markup.
     dirtyHTML = removeBadNamespaceDefinition(dirtyHTML);
     Document doc = Jsoup.parse(dirtyHTML);

     // Serialise with XHTML escaping, outline mode and a two-space indent.
     Document.OutputSettings settings = doc.outputSettings();
     settings.escapeMode(Entities.EscapeMode.xhtml);
     settings.outline(true);
     settings.indentAmount(2);

     removeComments(doc);
     removeMalformedAttributes(doc);

     result = doc.outerHtml();
}
 
Example #9
Source File: AnalyzeRule.java    From MyBookshelf with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Applies the given rules one after another, each rule working on the result of the previous one,
 * and returns the final value as an entity-unescaped string.
 *
 * @param ruleList rules to apply in order; when empty the result is the empty string
 * @param isUrl    whether the value is a URL; if so and a base URL is set, the result is made absolute
 * @return the resulting string, or {@code ""} when there is no result
 */
public String getString(List<SourceRule> ruleList, boolean isUrl) throws Exception {
    // Start from the analysed object unless there is nothing to apply.
    Object value = ruleList.isEmpty() ? null : object;

    for (SourceRule sourceRule : ruleList) {
        if (!StringUtils.isTrimEmpty(sourceRule.rule)) {
            switch (sourceRule.mode) {
                case Js:
                    value = evalJS(sourceRule.rule, value);
                    break;
                case JSon:
                    value = getAnalyzeByJSonPath(value).getString(sourceRule.rule);
                    break;
                case XPath:
                    value = getAnalyzeByXPath(value).getString(sourceRule.rule);
                    break;
                case Default:
                    if (!isUrl || isEmpty(baseUrl)) {
                        value = getAnalyzeByJSoup(value).getString(sourceRule.rule);
                    } else {
                        value = getAnalyzeByJSoup(value).getString0(sourceRule.rule);
                    }
                    break;
            }
        }
        // The replacement step runs even when the rule text itself is blank.
        if (!isEmpty(sourceRule.replaceRegex)) {
            value = replaceRegex(String.valueOf(value), sourceRule);
        }
    }

    if (value == null) {
        return "";
    }

    String text = String.valueOf(value);
    if (isUrl && !StringUtils.isTrimEmpty(baseUrl)) {
        return NetworkUtils.getAbsoluteURL(baseUrl, Entities.unescape(text));
    }
    try {
        return Entities.unescape(text);
    } catch (Exception e) {
        // fall back to the raw text when unescaping fails
        return text;
    }
}
 
Example #10
Source File: HtmlToDOCDemo.java    From docx4j-template with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an HTML string into docx4j content objects by importing it as an XHTML alt-chunk
 * into a temporary copy of the package's body. The original body is put back afterwards.
 *
 * @param wordMLPackage package whose main document part is used for the conversion
 * @param content       HTML to convert
 * @return the content list of the main document part after the alt-chunk conversion
 */
private static List<Object> convertToWmlObject(
		WordprocessingMLPackage wordMLPackage, String content)
		throws Docx4JException, JAXBException {
	MainDocumentPart mainPart = wordMLPackage.getMainDocumentPart();
	// Charset name used to encode the XHTML handed to docx4j
	String encoding = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_CONVERT_OUT_WMLTEMPLATE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME );

	// Print the template body before it is touched
	System.out.println(XmlUtils.marshaltoString(mainPart.getContents().getBody()));
	Body originalBody = mainPart.getContents().getBody();
	try {
		// Work on an emptied deep copy of the body
		mainPart.getContents().setBody(XmlUtils.deepCopy(originalBody));
		mainPart.getContent().clear();

		Document xhtml = Jsoup.parse(content);
		xhtml.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml);

		byte[] xhtmlBytes = xhtml.html().getBytes(Charset.forName(encoding));
		mainPart.addAltChunk(AltChunkType.Xhtml, xhtmlBytes);

		WordprocessingMLPackage converted = mainPart.convertAltChunks();
		converted.save(new File("d://temp.docx"));
		return mainPart.getContent();
	} finally {
		// Always restore the template body
		mainPart.getContents().setBody(originalBody);
	}
}
 
Example #11
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
@Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() {
    // Base entities match even without the trailing ';', extended entities only with it.
    Document doc = Jsoup.parse("&amp &quot &reg &icy &hopf &icy; &hopf;");

    // Output settings are changed only to make the expected string easier to read.
    Document.OutputSettings settings = doc.outputSettings();
    settings.escapeMode(Entities.EscapeMode.extended);
    settings.charset("ascii");

    String rendered = doc.body().html();
    assertEquals("&amp; \" &reg; &amp;icy &amp;hopf &icy; &hopf;", rendered);
}
 
Example #12
Source File: HtmlConverter.java    From docx4j-template with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches the page at the given URL as a {@link org.jsoup.nodes.Document} configured for XHTML output.
 * <p>
 * All {@code script} elements are removed, the {@code onclick} and {@code href} attributes are stripped
 * from every {@code a} element, and the {@code href} of every {@code link} element is made absolute.
 *
 * @param url address of the page to fetch
 * @return the fetched and adjusted document
 * @throws Exception if the page cannot be fetched
 */
protected Document url2xhtml(String url) throws Exception {
    Document doc = Jsoup.connect(url).get();

    if (logger.isDebugEnabled()) {
        logger.debug("baseUri: {}", doc.baseUri());
    }

    // Drop every script element.
    doc.getElementsByTag("script").remove();

    // Strip the onclick and href attributes from all anchors.
    doc.getElementsByTag("a").removeAttr("onclick").removeAttr("href");

    // Rewrite stylesheet/link addresses to absolute URLs.
    for (Element link : doc.getElementsByTag("link")) {
        String absoluteHref = link.absUrl("href");

        if (logger.isDebugEnabled()) {
            logger.debug("href: {} -> {}", link.attr("href"), absoluteHref);
        }

        link.attr("href", absoluteHref);
    }

    // Serialise as XHTML: XML syntax with the xhtml escape mode.
    doc.outputSettings()
            .syntax(Document.OutputSettings.Syntax.xml)
            .escapeMode(Entities.EscapeMode.xhtml);

    if (logger.isDebugEnabled()) {
        int lineNo = 0;
        for (String line : doc.html().split("\n")) {
            logger.debug("line {}:\t{}", ++lineNo, line);
        }
    }
    return doc;
}
 
Example #13
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
@Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() {
    // A base entity is recognised without its closing ';'; an extended entity needs the ';'.
    final String input = "&amp &quot &reg &icy &hopf &icy; &hopf;";
    final String expected = "&amp; \" &reg; &amp;icy &amp;hopf &icy; &hopf;";

    Document parsed = Jsoup.parse(input);
    // Only the output is affected here; this just makes the comparison unambiguous.
    parsed.outputSettings()
        .escapeMode(Entities.EscapeMode.extended)
        .charset("ascii");

    assertEquals(expected, parsed.body().html());
}
 
Example #14
Source File: HtmlImportProcessorImpl.java    From yarg with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the HTML source, runs {@code processHtmlDocument} on it and serialises the result
 * with XML syntax, no pretty printing and the xhtml escape mode.
 */
@Override
public String processHtml(String source) {
    org.jsoup.nodes.Document parsed = Jsoup.parse(source);
    processHtmlDocument(parsed);

    org.jsoup.nodes.Document.OutputSettings settings = parsed.outputSettings();
    settings.syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
    settings.prettyPrint(false);
    settings.escapeMode(Entities.EscapeMode.xhtml);

    return parsed.html();
}
 
Example #15
Source File: XHTMLImporterUtils.java    From docx4j-template with Apache License 2.0 5 votes vote down vote up
/**
 * Imports a jsoup document into a Word package, either as an XHTML alt-chunk or through
 * {@code XHTMLImporterImpl}.
 *
 * @param wmlPackage target package
 * @param doc        document to import; its output settings are switched to XML syntax / xhtml escaping
 * @param fragment   when true only the body's inner HTML is imported, otherwise the whole document
 * @param altChunk   when true the markup is added as an alt-chunk and the package returned by
 *                   {@code convertAltChunks()} is the result; otherwise the converted content is
 *                   appended to {@code wmlPackage}, which is returned
 */
public static WordprocessingMLPackage handle(WordprocessingMLPackage wmlPackage, Document doc,boolean fragment,boolean altChunk) throws IOException, Docx4JException {
	// Serialise the document as XHTML
	doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml);

	if (!altChunk) {
		// Convert the XHTML into objects the package can hold and append them
		XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wmlPackage);
		String xhtml = fragment ? doc.body().html() : doc.html();
		List<Object> converted = xhtmlImporter.convert(xhtml, doc.baseUri());
		wmlPackage.getMainDocumentPart().getContent().addAll(converted);
		// Same package that was passed in
		return wmlPackage;
	}

	MainDocumentPart mainPart = wmlPackage.getMainDocumentPart();
	// Charset name used to encode the XHTML bytes
	String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME );
	String xhtml = fragment ? doc.body().html() : doc.html();
	mainPart.addAltChunk(AltChunkType.Xhtml, xhtml.getBytes(Charset.forName(charsetName)));

	// Package produced by converting the alt-chunks
	return mainPart.convertAltChunks();
}
 
Example #16
Source File: Tokeniser.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Tries to consume a character reference at the reader's current position: either a numeric
 * reference (starting with {@code #}, optionally followed by {@code x}/{@code X} for hex) or a
 * named entity. Parsing starts at the {@code #} or the entity name, so the leading {@code &} has
 * presumably already been consumed by the caller (TODO confirm).
 * <p>
 * Problems such as a missing semicolon or an out-of-range code point are reported through
 * {@code characterReferenceError}.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed and {@code null} is returned
 * @param inAttribute when true, a named reference directly followed by a letter, a digit,
 *        {@code '='}, {@code '-'} or {@code '_'} is not treated as a reference
 * @return {@code null} if no reference was consumed (the reader is rewound to its mark where
 *         something had already been read); otherwise the decoded character(s). Except for
 *         supplementary code points, the returned array is {@code charRefHolder}, which is
 *         declared outside this method, so it is presumably reused between calls (TODO confirm)
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty())
        return null;
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
        return null;
    if (reader.matchesAnySorted(notCharRefCharsSorted))
        return null;

    final char[] charRef = charRefHolder;
    // remember the position so a failed match can be undone
    reader.mark();
    if (reader.matchConsume("#")) { // numbered
        boolean isHexMode = reader.matchConsumeIgnoreCase("X");
        String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (numRef.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        // a missing semicolon is reported but does not stop the reference from being decoded
        if (!reader.matchConsume(";"))
            characterReferenceError("missing semicolon"); // missing semi
        // -1 marks "could not be parsed"
        int charval = -1;
        try {
            int base = isHexMode ? 16 : 10;
            charval = Integer.valueOf(numRef, base);
        } catch (NumberFormatException e) {
        } // skip
        // unparsable, in the surrogate range, or above the last code point: use the replacement character
        if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            charRef[0] = replacementChar;
            return charRef;
        } else {
            // todo: implement number replacement table
            // todo: check for extra illegal unicode points as parse errors
            if (charval < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
                // fits into a single char
                charRef[0] = (char) charval;
                return charRef;
            } else
            // supplementary code point: returned as a newly created char array
            return Character.toChars(charval);
        }
    } else { // named
        // get as many letters as possible, and look for matching entities.
        String nameRef = reader.consumeLetterThenDigitSequence();
        // true when the name is directly followed by ';' (which is not consumed yet)
        boolean looksLegit = reader.matches(';');
        // found if a base named entity without a ;, or an extended entity with the ;.
        boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));

        if (!found) {
            reader.rewindToMark();
            if (looksLegit) // named with semicolon
                characterReferenceError(String.format("invalid named referenece '%s'", nameRef));
            return null;
        }
        if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
            // don't want that to match
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";"))
            characterReferenceError("missing semicolon"); // missing semi
        charRef[0] = Entities.getCharacterByName(nameRef);
        return charRef;
    }
}
 
Example #17
Source File: Tokeniser.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Tries to consume a character reference at the reader's current position and returns it as
 * code points: either a numeric reference (starting with {@code #}, optionally followed by
 * {@code x}/{@code X} for hex) or a named entity. Parsing starts at the {@code #} or the entity
 * name, so the leading {@code &} has presumably already been consumed by the caller (TODO confirm).
 * <p>
 * Problems such as a missing semicolon or an out-of-range code point are reported through
 * {@code characterReferenceError}.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed and {@code null} is returned
 * @param inAttribute when true, a named reference directly followed by a letter, a digit,
 *        {@code '='}, {@code '-'} or {@code '_'} is not treated as a reference
 * @return {@code null} if no reference was consumed (the reader is rewound to its mark where
 *         something had already been read); otherwise {@code codepointHolder} with the code point
 *         in element 0, or {@code multipointHolder} for an entity that maps to two code points.
 *         Both arrays are declared outside this method, so they are presumably reused between
 *         calls (TODO confirm)
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty())
        return null;
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
        return null;
    if (reader.matchesAnySorted(notCharRefCharsSorted))
        return null;

    final int[] codeRef = codepointHolder;
    // remember the position so a failed match can be undone
    reader.mark();
    if (reader.matchConsume("#")) { // numbered
        boolean isHexMode = reader.matchConsumeIgnoreCase("X");
        String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (numRef.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        // a missing semicolon is reported but does not stop the reference from being decoded
        if (!reader.matchConsume(";"))
            characterReferenceError("missing semicolon"); // missing semi
        // -1 marks "could not be parsed"
        int charval = -1;
        try {
            int base = isHexMode ? 16 : 10;
            charval = Integer.valueOf(numRef, base);
        } catch (NumberFormatException ignored) {
        } // skip
        // unparsable, in the surrogate range, or above the last code point: use the replacement character
        if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            codeRef[0] = replacementChar;
            return codeRef;
        } else {
            // todo: implement number replacement table
            // todo: check for extra illegal unicode points as parse errors
            codeRef[0] = charval;
            return codeRef;
        }
    } else { // named
        // get as many letters as possible, and look for matching entities.
        String nameRef = reader.consumeLetterThenDigitSequence();
        // true when the name is directly followed by ';' (which is not consumed yet)
        boolean looksLegit = reader.matches(';');
        // found if a base named entity without a ;, or an extended entity with the ;.
        boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));

        if (!found) {
            reader.rewindToMark();
            if (looksLegit) // named with semicolon
                characterReferenceError(String.format("invalid named referenece '%s'", nameRef));
            return null;
        }
        if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
            // don't want that to match
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";"))
            characterReferenceError("missing semicolon"); // missing semi
        // the entity's code points are written into multipointHolder; the return value is how many
        int numChars = Entities.codepointsForName(nameRef, multipointHolder);
        if (numChars == 1) {
            codeRef[0] = multipointHolder[0];
            return codeRef;
        } else if (numChars ==2) {
            return multipointHolder;
        } else {
            // any other count is unexpected and is handed to Validate.fail
            Validate.fail("Unexpected characters returned for " + nameRef);
            return multipointHolder;
        }
    }
}
 
Example #18
Source File: Tokeniser.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Attempts to read one character reference from the reader and return its code point(s).
 * Handles numeric references ({@code #} followed by decimal digits, or by {@code x}/{@code X}
 * and hex digits) and named entities. The leading {@code &} is not looked for here, so it has
 * presumably been consumed by the caller already (TODO confirm).
 * <p>
 * Parse problems (no numerals, missing semicolon, invalid range, unknown name) are reported via
 * {@code characterReferenceError}.
 *
 * @param additionalAllowedCharacter when non-null and equal to the reader's current character,
 *        the method returns {@code null} without consuming anything
 * @param inAttribute when true, a named reference immediately followed by a letter, a digit,
 *        {@code '='}, {@code '-'} or {@code '_'} is rejected
 * @return {@code null} when nothing was recognised (after rewinding to the mark if input had been
 *         read); otherwise {@code codepointHolder} holding one code point at index 0, or
 *         {@code multipointHolder} when the named entity yields two code points. These arrays
 *         come from outside this method and are presumably shared between calls (TODO confirm)
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty())
        return null;
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
        return null;
    if (reader.matchesAnySorted(notCharRefCharsSorted))
        return null;

    final int[] codeRef = codepointHolder;
    // mark the start so that an unsuccessful attempt can be rolled back
    reader.mark();
    if (reader.matchConsume("#")) { // numbered
        boolean isHexMode = reader.matchConsumeIgnoreCase("X");
        String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (numRef.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        // the semicolon is optional for decoding; its absence is only reported
        if (!reader.matchConsume(";"))
            characterReferenceError("missing semicolon"); // missing semi
        // stays -1 if the digits cannot be parsed into an int
        int charval = -1;
        try {
            int base = isHexMode ? 16 : 10;
            charval = Integer.valueOf(numRef, base);
        } catch (NumberFormatException ignored) {
        } // skip
        // reject unparsable values, surrogates and values beyond 0x10FFFF
        if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            codeRef[0] = replacementChar;
            return codeRef;
        } else {
            // todo: implement number replacement table
            // todo: check for extra illegal unicode points as parse errors
            codeRef[0] = charval;
            return codeRef;
        }
    } else { // named
        // get as many letters as possible, and look for matching entities.
        String nameRef = reader.consumeLetterThenDigitSequence();
        // whether a ';' follows the name; it is consumed further below
        boolean looksLegit = reader.matches(';');
        // found if a base named entity without a ;, or an extended entity with the ;.
        boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));

        if (!found) {
            reader.rewindToMark();
            if (looksLegit) // named with semicolon
                characterReferenceError(String.format("invalid named referenece '%s'", nameRef));
            return null;
        }
        if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
            // don't want that to match
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";"))
            characterReferenceError("missing semicolon"); // missing semi
        // fills multipointHolder with the entity's code points and returns their count
        int numChars = Entities.codepointsForName(nameRef, multipointHolder);
        if (numChars == 1) {
            codeRef[0] = multipointHolder[0];
            return codeRef;
        } else if (numChars ==2) {
            return multipointHolder;
        } else {
            // a count other than 1 or 2 is not expected; passed on to Validate.fail
            Validate.fail("Unexpected characters returned for " + nameRef);
            return multipointHolder;
        }
    }
}
 
Example #19
Source File: Tokeniser.java    From jsoup-learning with MIT License 4 votes vote down vote up
/**
 * Tries to consume a character reference at the reader's current position: either a numeric
 * reference (starting with {@code #}, optionally followed by {@code x}/{@code X} for hex) or a
 * named entity. Parsing starts at the {@code #} or the entity name, so the leading {@code &} has
 * presumably already been consumed by the caller (TODO confirm).
 * <p>
 * Problems such as a missing semicolon or an out-of-range code point are reported through
 * {@code characterReferenceError}.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed and {@code null} is returned
 * @param inAttribute when true, a named reference directly followed by a letter, a digit,
 *        {@code '='}, {@code '-'} or {@code '_'} is not treated as a reference
 * @return {@code null} if no reference was consumed (the reader is rewound to its mark where
 *         something had already been read); otherwise a newly created array with the decoded
 *         character(s)
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty())
        return null;
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
        return null;
    // whitespace, '<' and '&' cannot start a character reference
    if (reader.matchesAny('\t', '\n', '\r', '\f', ' ', '<', '&'))
        return null;

    // remember the position so a failed match can be undone
    reader.mark();
    if (reader.matchConsume("#")) { // numbered
        boolean isHexMode = reader.matchConsumeIgnoreCase("X");
        String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (numRef.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        // a missing semicolon is reported but does not stop the reference from being decoded
        if (!reader.matchConsume(";"))
            characterReferenceError("missing semicolon"); // missing semi
        // -1 marks "could not be parsed"
        int charval = -1;
        try {
            int base = isHexMode ? 16 : 10;
            charval = Integer.valueOf(numRef, base);
        } catch (NumberFormatException e) {
        } // skip
        // unparsable, in the surrogate range, or above the last code point: use the replacement character
        if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            return new char[]{replacementChar};
        } else {
            // todo: implement number replacement table
            // todo: check for extra illegal unicode points as parse errors
            return Character.toChars(charval);
        }
    } else { // named
        // get as many letters as possible, and look for matching entities.
        String nameRef = reader.consumeLetterThenDigitSequence();
        // true when the name is directly followed by ';' (which is not consumed yet)
        boolean looksLegit = reader.matches(';');
        // found if a base named entity without a ;, or an extended entity with the ;.
        boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));

        if (!found) {
            reader.rewindToMark();
            if (looksLegit) // named with semicolon
                characterReferenceError(String.format("invalid named referenece '%s'", nameRef));
            return null;
        }
        if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
            // don't want that to match
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";"))
            characterReferenceError("missing semicolon"); // missing semi
        return new char[]{Entities.getCharacterByName(nameRef)};
    }
}