org.jsoup.nodes.Entities Java Examples
The following examples show how to use
org.jsoup.nodes.Entities.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CleanerTest.java From astor with GNU General Public License v2.0 | 6 votes |
@Test public void supplyOutputSettings() { // test that one can override the default document output settings Document.OutputSettings os = new Document.OutputSettings(); os.prettyPrint(false); os.escapeMode(Entities.EscapeMode.extended); os.charset("ascii"); String html = "<div><p>ℬ</p></div>"; String customOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); String defaultOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed()); assertNotSame(defaultOut, customOut); assertEquals("<div><p>ℬ</p></div>", customOut); assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", defaultOut); os.charset("ASCII"); os.escapeMode(Entities.EscapeMode.base); String customOut2 = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); assertEquals("<div><p>ℬ</p></div>", customOut2); }
Example #2
Source File: CleanerTest.java From jsoup-learning with MIT License | 6 votes |
@Test public void supplyOutputSettings() { // test that one can override the default document output settings Document.OutputSettings os = new Document.OutputSettings(); os.prettyPrint(false); os.escapeMode(Entities.EscapeMode.extended); String html = "<div><p>ℬ</p></div>"; String customOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); String defaultOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed()); assertNotSame(defaultOut, customOut); assertEquals("<div><p>ℬ</p></div>", customOut); assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", defaultOut); os.charset("ASCII"); os.escapeMode(Entities.EscapeMode.base); String customOut2 = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); assertEquals("<div><p>ℬ</p></div>", customOut2); }
Example #3
Source File: CleanerTest.java From astor with GNU General Public License v2.0 | 6 votes |
@Test public void supplyOutputSettings() { // test that one can override the default document output settings Document.OutputSettings os = new Document.OutputSettings(); os.prettyPrint(false); os.escapeMode(Entities.EscapeMode.extended); os.charset("ascii"); String html = "<div><p>ℬ</p></div>"; String customOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); String defaultOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed()); assertNotSame(defaultOut, customOut); assertEquals("<div><p>ℬ</p></div>", customOut); // entities now prefers shorted names if aliased assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", defaultOut); os.charset("ASCII"); os.escapeMode(Entities.EscapeMode.base); String customOut2 = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); assertEquals("<div><p>ℬ</p></div>", customOut2); }
Example #4
Source File: CleanerTest.java From astor with GNU General Public License v2.0 | 6 votes |
@Test public void supplyOutputSettings() { // test that one can override the default document output settings Document.OutputSettings os = new Document.OutputSettings(); os.prettyPrint(false); os.escapeMode(Entities.EscapeMode.extended); os.charset("ascii"); String html = "<div><p>ℬ</p></div>"; String customOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); String defaultOut = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed()); assertNotSame(defaultOut, customOut); assertEquals("<div><p>ℬ</p></div>", customOut); // entities now prefers shorted names if aliased assertEquals("<div>\n" + " <p>ℬ</p>\n" + "</div>", defaultOut); os.charset("ASCII"); os.escapeMode(Entities.EscapeMode.base); String customOut2 = Jsoup.clean(html, "http://foo.com/", Whitelist.relaxed(), os); assertEquals("<div><p>ℬ</p></div>", customOut2); }
Example #5
Source File: Html.java From zongtui-webcrawler with GNU General Public License v2.0 | 5 votes |
/**
 * Switches off jsoup's HTML entity escaping by emptying the entity maps of the {@code base}
 * and {@code extended} escape modes. This is a hack that only works for jsoup 1.7.2.
 * <p>
 * Nothing happens when {@code DISABLE_HTML_ENTITY_ESCAPE} is off or when {@code INITED} is
 * already set; otherwise the maps are cleared and {@code INITED} is set.
 */
private void disableJsoupHtmlEntityEscape() {
    if (!DISABLE_HTML_ENTITY_ESCAPE || INITED) {
        return;
    }
    Entities.EscapeMode.base.getMap().clear();
    Entities.EscapeMode.extended.getMap().clear();
    INITED = true;
}
Example #6
Source File: HerdStringUtils.java From herd with Apache License 2.0 | 5 votes |
/** * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist * * @param fragment the specified String * @param whitelistTags the specified whitelist tags * * @return cleaned String with allowed tags */ public static String stripHtml(String fragment, String... whitelistTags) { // Unescape HTML. String unEscapedFragment = StringEscapeUtils.unescapeHtml4(fragment); // Parse out html tags except those from a given list of whitelist tags Document dirty = Jsoup.parseBodyFragment(unEscapedFragment); Whitelist whitelist = new Whitelist(); for (String whitelistTag : whitelistTags) { // Get the actual tag name from the whitelist tag // this is vulnerable in general to complex tags but will suffice for our simple needs whitelistTag = StringUtils.removePattern(whitelistTag, "[^\\{IsAlphabetic}]"); // Add all specified tags to the whitelist while preserving inline css whitelist.addTags(whitelistTag).addAttributes(whitelistTag, "class"); } Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); // Set character encoding to UTF-8 and make sure no line-breaks are added clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8).prettyPrint(false); // return 'cleaned' html body return clean.body().html(); }
Example #7
Source File: Rgaa3Extractor.java From Asqatasun with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Rewrites the {@code .test-detail} element of every HTML testcase found below
 * {@code RGAA3_TESTCASE_PATH}.
 * <p>
 * Each sub-directory is named after a rule class; its name (minus {@code "Rgaa30Rule"} and
 * {@code ".java"}) is read as three two-character numbers (theme, criterion, test). For each
 * testcase file whose name contains {@code "html"}, the detail element is turned into an empty
 * {@code div}, given {@code lang="fr"} when it has no {@code lang}, and filled with the raw rule
 * HTML looked up in {@code RGAA3}. Occurrences of the zero-padded dotted key are replaced by the
 * rule's dotted form before the file is written back.
 *
 * @throws IOException if the testcase directory cannot be listed or a file cannot be read or written
 */
private static void createTestcaseFiles() throws IOException {
    File srcDir = new File(RGAA3_TESTCASE_PATH);
    File[] ruleDirs = srcDir.listFiles();
    if (ruleDirs == null) {
        // listFiles() yields null when the path is not a directory or cannot be read
        throw new IOException("Unable to list testcase directory: " + srcDir.getAbsolutePath());
    }
    for (File file : ruleDirs) {
        File[] testcases = file.listFiles();
        if (testcases == null) {
            // plain file (or unreadable directory): there are no testcases to rewrite
            continue;
        }
        String fileName = file.getName().replace("Rgaa30Rule", "").replace(".java", "");
        String theme = fileName.substring(0, 2);
        String crit = fileName.substring(2, 4);
        String test = fileName.substring(4, 6);
        // parsing drops the zero padding: "01", "02", "03" -> "1-2-3"
        String testKey = Integer.parseInt(theme) + "-" + Integer.parseInt(crit) + "-" + Integer.parseInt(test);
        String wrongKey = theme + "." + crit + "." + test;
        for (File testcase : testcases) {
            if (!testcase.isFile() || !testcase.getName().contains("html")) {
                continue;
            }
            Document doc = Jsoup.parse(FileUtils.readFileToString(testcase));
            Element detail = doc.select(".test-detail").first();
            if (detail == null) {
                System.out.println(doc.outerHtml());
                continue;
            }
            detail.tagName("div");
            detail.text("");
            for (Element el : detail.children()) {
                el.remove();
            }
            if (!detail.hasAttr("lang")) {
                detail.attr("lang", "fr");
            }
            detail.append("\n" + RGAA3.get(testKey).ruleRawHtml + "\n");
            doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
            doc.outputSettings().outline(false);
            doc.outputSettings().indentAmount(4);
            String outputHtml = doc.outerHtml();
            if (outputHtml.contains(wrongKey)) {
                // literal replacement: the dots in wrongKey must not act as regex wildcards and
                // the replacement text must not be scanned for '$' or '\' group references
                outputHtml = outputHtml.replace(wrongKey, RGAA3.get(testKey).getRuleDot());
            }
            FileUtils.writeStringToFile(testcase, outputHtml);
        }
    }
}
Example #8
Source File: HTMLJsoupCleanerImpl.java From Asqatasun with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Cleans {@code dirtyHTML} and stores the serialized outcome in {@code result}: bad namespace
 * definitions are removed from the raw text, the text is parsed with jsoup, comments and
 * malformed attributes are stripped, and the document is written out with the xhtml escape
 * mode, outline mode on and an indent of 2.
 */
@Override
public void run() {
    dirtyHTML = removeBadNamespaceDefinition(dirtyHTML);
    Document doc = Jsoup.parse(dirtyHTML);

    // configure the serialization before touching the tree
    Document.OutputSettings settings = doc.outputSettings();
    settings.escapeMode(Entities.EscapeMode.xhtml);
    settings.outline(true);
    settings.indentAmount(2);

    removeComments(doc);
    removeMalformedAttributes(doc);

    result = doc.outerHtml();
}
Example #9
Source File: AnalyzeRule.java From MyBookshelf with GNU General Public License v3.0 | 5 votes |
/**
 * Applies the given rules in order and returns the final value as an entity-unescaped string.
 * <p>
 * The running value starts as {@code object} (or {@code null} for an empty rule list). Each rule
 * with a non-blank {@code rule} text is evaluated according to its mode, then the rule's
 * replace-regex (if any) is applied to the string form of the running value.
 *
 * @param ruleList rules to apply, in order
 * @param isUrl whether the result is a URL; when {@code baseUrl} is set the result is then
 *        resolved against it
 * @return {@code ""} when the final value is {@code null}; otherwise the unescaped text (made
 *         absolute for URLs), or the raw text if unescaping throws
 * @throws Exception propagated from rule evaluation
 */
public String getString(List<SourceRule> ruleList, boolean isUrl) throws Exception {
    Object result = ruleList.isEmpty() ? null : object;

    for (SourceRule rule : ruleList) {
        if (!StringUtils.isTrimEmpty(rule.rule)) {
            switch (rule.mode) {
                case Js:
                    result = evalJS(rule.rule, result);
                    break;
                case JSon:
                    result = getAnalyzeByJSonPath(result).getString(rule.rule);
                    break;
                case XPath:
                    result = getAnalyzeByXPath(result).getString(rule.rule);
                    break;
                case Default:
                    // URLs with a known base go through getString0, everything else through getString
                    result = (isUrl && !isEmpty(baseUrl))
                            ? getAnalyzeByJSoup(result).getString0(rule.rule)
                            : getAnalyzeByJSoup(result).getString(rule.rule);
                    break;
            }
        }
        // the replacement runs even when the rule text itself was blank
        if (!isEmpty(rule.replaceRegex)) {
            result = replaceRegex(String.valueOf(result), rule);
        }
    }

    if (result == null) {
        return "";
    }

    final String text = String.valueOf(result);
    if (isUrl && !StringUtils.isTrimEmpty(baseUrl)) {
        return NetworkUtils.getAbsoluteURL(baseUrl, Entities.unescape(text));
    }
    try {
        return Entities.unescape(text);
    } catch (Exception e) {
        // best effort: hand back the text as-is when it cannot be unescaped
        return text;
    }
}
Example #10
Source File: HtmlToDOCDemo.java From docx4j-template with Apache License 2.0 | 5 votes |
private static List<Object> convertToWmlObject( WordprocessingMLPackage wordMLPackage, String content) throws Docx4JException, JAXBException { MainDocumentPart document = wordMLPackage.getMainDocumentPart(); //获取Jsoup参数 String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_CONVERT_OUT_WMLTEMPLATE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME ); List<Object> wmlObjList = null; String templateString = XmlUtils.marshaltoString(document.getContents().getBody()); System.out.println(templateString); Body templateBody = document.getContents().getBody(); try { document.getContents().setBody(XmlUtils.deepCopy(templateBody)); document.getContent().clear(); Document doc = Jsoup.parse(content); doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml); //XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wordMLPackage); AlternativeFormatInputPart part = document.addAltChunk(AltChunkType.Xhtml,doc.html().getBytes(Charset.forName(charsetName))); WordprocessingMLPackage tempPackage = document.convertAltChunks(); File file = new File("d://temp.docx"); tempPackage.save(file); wmlObjList = document.getContent(); //part.getOwningRelationshipPart().getSourceP().get //wmlObjList = xhtmlImporter.convert(doc.html(), doc.baseUri()); } finally { document.getContents().setBody(templateBody); } return wmlObjList; }
Example #11
Source File: HtmlParserTest.java From astor with GNU General Public License v2.0 | 5 votes |
@Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() { // extended entities need a ; at the end to match, base does not String html = "& " ® &icy &hopf и 𝕙"; Document doc = Jsoup.parse(html); doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii"); // modifies output only to clarify test assertEquals("& \" ® &icy &hopf и 𝕙", doc.body().html()); }
Example #12
Source File: HtmlConverter.java From docx4j-template with Apache License 2.0 | 5 votes |
/** * 将页面转为{@link org.jsoup.nodes.Document}对象,xhtml 格式 * * @param url * @return * @throws Exception */ protected Document url2xhtml(String url) throws Exception { Document doc = Jsoup.connect(url).get(); //获得 if (logger.isDebugEnabled()) { logger.debug("baseUri: {}", doc.baseUri()); } for (Element script : doc.getElementsByTag("script")) { //除去所有 script script.remove(); } for (Element a : doc.getElementsByTag("a")) { //除去 a 的 onclick,href 属性 a.removeAttr("onclick"); a.removeAttr("href"); } Elements links = doc.getElementsByTag("link"); //将link中的地址替换为绝对地址 for (Element element : links) { String href = element.absUrl("href"); if (logger.isDebugEnabled()) { logger.debug("href: {} -> {}", element.attr("href"), href); } element.attr("href", href); } doc.outputSettings() .syntax(Document.OutputSettings.Syntax.xml) .escapeMode(Entities.EscapeMode.xhtml); //转为 xhtml 格式 if (logger.isDebugEnabled()) { String[] split = doc.html().split("\n"); for (int c = 0; c < split.length; c++) { logger.debug("line {}:\t{}", c + 1, split[c]); } } return doc; }
Example #13
Source File: HtmlParserTest.java From astor with GNU General Public License v2.0 | 5 votes |
@Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() { // extended entities need a ; at the end to match, base does not String html = "& " ® &icy &hopf и 𝕙"; Document doc = Jsoup.parse(html); doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii"); // modifies output only to clarify test assertEquals("& \" ® &icy &hopf и 𝕙", doc.body().html()); }
Example #14
Source File: HtmlImportProcessorImpl.java From yarg with Apache License 2.0 | 5 votes |
/**
 * Parses {@code source} with jsoup, runs {@code processHtmlDocument} on the parsed document and
 * serializes it with xml syntax, no pretty-printing and the xhtml escape mode.
 *
 * @param source HTML text to process
 * @return the serialized document
 */
@Override
public String processHtml(String source) {
    org.jsoup.nodes.Document parsed = Jsoup.parse(source);
    processHtmlDocument(parsed);

    org.jsoup.nodes.Document.OutputSettings settings = parsed.outputSettings();
    settings.syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
    settings.prettyPrint(false);
    settings.escapeMode(Entities.EscapeMode.xhtml);

    return parsed.html();
}
Example #15
Source File: XHTMLImporterUtils.java From docx4j-template with Apache License 2.0 | 5 votes |
public static WordprocessingMLPackage handle(WordprocessingMLPackage wmlPackage, Document doc,boolean fragment,boolean altChunk) throws IOException, Docx4JException { //设置转换模式 doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml); //转为 xhtml 格式 if(altChunk){ //Document对象 MainDocumentPart document = wmlPackage.getMainDocumentPart(); //获取Jsoup参数 String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME ); //设置转换模式 doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml); //转为 xhtml 格式 //创建html导入对象 //XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wordMLPackage); document.addAltChunk(AltChunkType.Xhtml, (fragment ? doc.body().html() : doc.html()) .getBytes(Charset.forName(charsetName))); //document.addAltChunk(type, bytes, attachmentPoint) //document.addAltChunk(type, is) //document.addAltChunk(type, is, attachmentPoint) WordprocessingMLPackage tempPackage = document.convertAltChunks(); //返回处理后的WordprocessingMLPackage对象 return tempPackage; } //创建html导入对象 XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wmlPackage); //将xhtml转换为wmlPackage可用的对象 List<Object> list = xhtmlImporter.convert((fragment ? doc.body().html() : doc.html()), doc.baseUri()); //导入转换后的内容对象 wmlPackage.getMainDocumentPart().getContent().addAll(list); //返回原WordprocessingMLPackage对象 return wmlPackage; }
Example #16
Source File: Tokeniser.java From astor with GNU General Public License v2.0 | 4 votes |
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAnySorted(notCharRefCharsSorted)) return null; final char[] charRef = charRefHolder; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException e) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); charRef[0] = replacementChar; return charRef; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors if (charval < Character.MIN_SUPPLEMENTARY_CODE_POINT) { charRef[0] = (char) charval; return charRef; } else return Character.toChars(charval); } } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. 
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi charRef[0] = Entities.getCharacterByName(nameRef); return charRef; } }
Example #17
Source File: Tokeniser.java From astor with GNU General Public License v2.0 | 4 votes |
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAnySorted(notCharRefCharsSorted)) return null; final int[] codeRef = codepointHolder; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException ignored) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); codeRef[0] = replacementChar; return codeRef; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors codeRef[0] = charval; return codeRef; } } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. 
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int numChars = Entities.codepointsForName(nameRef, multipointHolder); if (numChars == 1) { codeRef[0] = multipointHolder[0]; return codeRef; } else if (numChars ==2) { return multipointHolder; } else { Validate.fail("Unexpected characters returned for " + nameRef); return multipointHolder; } } }
Example #18
Source File: Tokeniser.java From astor with GNU General Public License v2.0 | 4 votes |
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAnySorted(notCharRefCharsSorted)) return null; final int[] codeRef = codepointHolder; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException ignored) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); codeRef[0] = replacementChar; return codeRef; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors codeRef[0] = charval; return codeRef; } } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. 
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int numChars = Entities.codepointsForName(nameRef, multipointHolder); if (numChars == 1) { codeRef[0] = multipointHolder[0]; return codeRef; } else if (numChars ==2) { return multipointHolder; } else { Validate.fail("Unexpected characters returned for " + nameRef); return multipointHolder; } } }
Example #19
Source File: Tokeniser.java From jsoup-learning with MIT License | 4 votes |
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAny('\t', '\n', '\r', '\f', ' ', '<', '&')) return null; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException e) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); return new char[]{replacementChar}; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors return Character.toChars(charval); } } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. 
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi return new char[]{Entities.getCharacterByName(nameRef)}; } }