Java Code Examples for org.jsoup.nodes.Entities#isNamedEntity()

The following examples show how to use org.jsoup.nodes.Entities#isNamedEntity(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out the related API usage in the sidebar.
Example 1
Source File: Tokeniser.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Tries to consume a character reference (the part after the ampersand) from {@code reader}.
 *
 * <p>Numeric references ({@code #123}, {@code #x7B}) and named references are handled. When a
 * single UTF-16 unit is produced it is written into slot 0 of {@code charRefHolder} (declared
 * outside this method) and that array is returned; supplementary code points are returned as
 * the array produced by {@link Character#toChars(int)}.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *     nothing is consumed and {@code null} is returned
 * @param inAttribute when {@code true}, a named reference that is directly followed by a letter,
 *     a digit, or one of {@code = - _} is rejected and the reader is rewound
 * @return the decoded character(s); {@code replacementChar} in slot 0 for an out-of-range
 *     numeric reference; or {@code null} when no reference was consumed
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    // Guard clauses: nothing here can start a character reference.
    if (reader.isEmpty()) {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
        return null;
    }
    if (reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final char[] holder = charRefHolder;
    reader.mark(); // remembered so a failed match can be undone with rewindToMark()

    if (reader.matchConsume("#")) {
        // ---- numeric reference ----
        final boolean hex = reader.matchConsumeIgnoreCase("X");
        final String digits;
        if (hex) {
            digits = reader.consumeHexSequence();
        } else {
            digits = reader.consumeDigitSequence();
        }
        if (digits.length() == 0) {
            // "#" or "#x" with nothing usable after it
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            // reported, but the reference is still decoded
            characterReferenceError("missing semicolon");
        }

        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hex ? 16 : 10);
        } catch (NumberFormatException e) {
            // too many digits to fit in an int: flag as invalid via the -1 sentinel
            codePoint = -1;
        }

        final boolean unparsable = codePoint == -1;
        final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        final boolean beyondUnicode = codePoint > 0x10FFFF;
        if (unparsable || surrogate || beyondUnicode) {
            characterReferenceError("character outside of valid range");
            holder[0] = replacementChar;
            return holder;
        }

        // todo: implement number replacement table
        // todo: check for extra illegal unicode points as parse errors
        if (codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
            // needs a surrogate pair, so it cannot go into the one-slot result
            return Character.toChars(codePoint);
        }
        holder[0] = (char) codePoint;
        return holder;
    }

    // ---- named reference ----
    // Take the longest letters-then-digits run and see whether it names an entity.
    final String name = reader.consumeLetterThenDigitSequence();
    final boolean semicolonFollows = reader.matches(';');
    // Base entities are accepted without the ';'; all other named entities require it.
    final boolean known =
            Entities.isBaseNamedEntity(name) || (Entities.isNamedEntity(name) && semicolonFollows);

    if (!known) {
        reader.rewindToMark();
        if (semicolonFollows) {
            // it was written like an entity ("&name;") but the name is unknown
            characterReferenceError(String.format("invalid named referenece '%s'", name));
        }
        return null;
    }

    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // inside an attribute value this is not treated as a reference
        reader.rewindToMark();
        return null;
    }

    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon");
    }
    holder[0] = Entities.getCharacterByName(name);
    return holder;
}
 
Example 2
Source File: Tokeniser.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Tries to consume a character reference (the part after the ampersand) from {@code reader}
 * and returns it as code points.
 *
 * <p>A single code point is written into slot 0 of {@code codepointHolder} (declared outside
 * this method) and that array is returned. A named entity for which
 * {@code Entities.codepointsForName} reports two code points is returned through
 * {@code multipointHolder} instead.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *     nothing is consumed and {@code null} is returned
 * @param inAttribute when {@code true}, a named reference that is directly followed by a letter,
 *     a digit, or one of {@code = - _} is rejected and the reader is rewound
 * @return the decoded code point(s); {@code replacementChar} in slot 0 for an out-of-range
 *     numeric reference; or {@code null} when no reference was consumed
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    // Guard clauses: nothing here can start a character reference.
    if (reader.isEmpty()) {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
        return null;
    }
    if (reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final int[] single = codepointHolder;
    reader.mark(); // remembered so a failed match can be undone with rewindToMark()

    if (reader.matchConsume("#")) {
        // ---- numeric reference ----
        final boolean hex = reader.matchConsumeIgnoreCase("X");
        final String digits;
        if (hex) {
            digits = reader.consumeHexSequence();
        } else {
            digits = reader.consumeDigitSequence();
        }
        if (digits.length() == 0) {
            // "#" or "#x" with nothing usable after it
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            // reported, but the reference is still decoded
            characterReferenceError("missing semicolon");
        }

        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hex ? 16 : 10);
        } catch (NumberFormatException ignored) {
            // too many digits to fit in an int: flag as invalid via the -1 sentinel
            codePoint = -1;
        }

        final boolean unparsable = codePoint == -1;
        final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        final boolean beyondUnicode = codePoint > 0x10FFFF;
        if (unparsable || surrogate || beyondUnicode) {
            characterReferenceError("character outside of valid range");
            single[0] = replacementChar;
        } else {
            // todo: implement number replacement table
            // todo: check for extra illegal unicode points as parse errors
            single[0] = codePoint;
        }
        return single;
    }

    // ---- named reference ----
    // Take the longest letters-then-digits run and see whether it names an entity.
    final String name = reader.consumeLetterThenDigitSequence();
    final boolean semicolonFollows = reader.matches(';');
    // Base entities are accepted without the ';'; all other named entities require it.
    final boolean known =
            Entities.isBaseNamedEntity(name) || (Entities.isNamedEntity(name) && semicolonFollows);

    if (!known) {
        reader.rewindToMark();
        if (semicolonFollows) {
            // it was written like an entity ("&name;") but the name is unknown
            characterReferenceError(String.format("invalid named referenece '%s'", name));
        }
        return null;
    }

    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // inside an attribute value this is not treated as a reference
        reader.rewindToMark();
        return null;
    }

    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon");
    }

    final int count = Entities.codepointsForName(name, multipointHolder);
    if (count == 1) {
        single[0] = multipointHolder[0];
        return single;
    }
    if (count != 2) {
        // only one or two code points are expected here
        Validate.fail("Unexpected characters returned for " + name);
    }
    return multipointHolder;
}
 
Example 3
Source File: Tokeniser.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Tries to consume a character reference (the part after the ampersand) from {@code reader}
 * and returns it as code points.
 *
 * <p>A single code point is written into slot 0 of {@code codepointHolder} (declared outside
 * this method) and that array is returned. A named entity for which
 * {@code Entities.codepointsForName} reports two code points is returned through
 * {@code multipointHolder} instead.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *     nothing is consumed and {@code null} is returned
 * @param inAttribute when {@code true}, a named reference that is directly followed by a letter,
 *     a digit, or one of {@code = - _} is rejected and the reader is rewound
 * @return the decoded code point(s); {@code replacementChar} in slot 0 for an out-of-range
 *     numeric reference; or {@code null} when no reference was consumed
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    // Guard clauses: nothing here can start a character reference.
    if (reader.isEmpty()) {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
        return null;
    }
    if (reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final int[] single = codepointHolder;
    reader.mark(); // remembered so a failed match can be undone with rewindToMark()

    if (reader.matchConsume("#")) {
        // ---- numeric reference ----
        final boolean hex = reader.matchConsumeIgnoreCase("X");
        final String digits;
        if (hex) {
            digits = reader.consumeHexSequence();
        } else {
            digits = reader.consumeDigitSequence();
        }
        if (digits.length() == 0) {
            // "#" or "#x" with nothing usable after it
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            // reported, but the reference is still decoded
            characterReferenceError("missing semicolon");
        }

        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hex ? 16 : 10);
        } catch (NumberFormatException ignored) {
            // too many digits to fit in an int: flag as invalid via the -1 sentinel
            codePoint = -1;
        }

        final boolean unparsable = codePoint == -1;
        final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        final boolean beyondUnicode = codePoint > 0x10FFFF;
        if (unparsable || surrogate || beyondUnicode) {
            characterReferenceError("character outside of valid range");
            single[0] = replacementChar;
        } else {
            // todo: implement number replacement table
            // todo: check for extra illegal unicode points as parse errors
            single[0] = codePoint;
        }
        return single;
    }

    // ---- named reference ----
    // Take the longest letters-then-digits run and see whether it names an entity.
    final String name = reader.consumeLetterThenDigitSequence();
    final boolean semicolonFollows = reader.matches(';');
    // Base entities are accepted without the ';'; all other named entities require it.
    final boolean known =
            Entities.isBaseNamedEntity(name) || (Entities.isNamedEntity(name) && semicolonFollows);

    if (!known) {
        reader.rewindToMark();
        if (semicolonFollows) {
            // it was written like an entity ("&name;") but the name is unknown
            characterReferenceError(String.format("invalid named referenece '%s'", name));
        }
        return null;
    }

    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // inside an attribute value this is not treated as a reference
        reader.rewindToMark();
        return null;
    }

    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon");
    }

    final int count = Entities.codepointsForName(name, multipointHolder);
    if (count == 1) {
        single[0] = multipointHolder[0];
        return single;
    }
    if (count != 2) {
        // only one or two code points are expected here
        Validate.fail("Unexpected characters returned for " + name);
    }
    return multipointHolder;
}
 
Example 4
Source File: Tokeniser.java    From jsoup-learning with MIT License 4 votes vote down vote up
/**
 * Tries to consume a character reference (the part after the ampersand) from {@code reader}.
 *
 * <p>Numeric references ({@code #123}, {@code #x7B}) and named references are handled. Every
 * successful call returns a newly created array.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *     nothing is consumed and {@code null} is returned
 * @param inAttribute when {@code true}, a named reference that is directly followed by a letter,
 *     a digit, or one of {@code = - _} is rejected and the reader is rewound
 * @return the decoded character(s); a one-element array holding {@code replacementChar} for an
 *     out-of-range numeric reference; or {@code null} when no reference was consumed
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    // Guard clauses: nothing here can start a character reference.
    if (reader.isEmpty()) {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
        return null;
    }
    if (reader.matchesAny('\t', '\n', '\r', '\f', ' ', '<', '&')) {
        return null;
    }

    reader.mark(); // remembered so a failed match can be undone with rewindToMark()

    if (reader.matchConsume("#")) {
        // ---- numeric reference ----
        final boolean hex = reader.matchConsumeIgnoreCase("X");
        final String digits;
        if (hex) {
            digits = reader.consumeHexSequence();
        } else {
            digits = reader.consumeDigitSequence();
        }
        if (digits.length() == 0) {
            // "#" or "#x" with nothing usable after it
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            // reported, but the reference is still decoded
            characterReferenceError("missing semicolon");
        }

        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hex ? 16 : 10);
        } catch (NumberFormatException e) {
            // too many digits to fit in an int: flag as invalid via the -1 sentinel
            codePoint = -1;
        }

        final boolean unparsable = codePoint == -1;
        final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        final boolean beyondUnicode = codePoint > 0x10FFFF;
        if (unparsable || surrogate || beyondUnicode) {
            characterReferenceError("character outside of valid range");
            return new char[]{replacementChar};
        }

        // todo: implement number replacement table
        // todo: check for extra illegal unicode points as parse errors
        return Character.toChars(codePoint);
    }

    // ---- named reference ----
    // Take the longest letters-then-digits run and see whether it names an entity.
    final String name = reader.consumeLetterThenDigitSequence();
    final boolean semicolonFollows = reader.matches(';');
    // Base entities are accepted without the ';'; all other named entities require it.
    final boolean known =
            Entities.isBaseNamedEntity(name) || (Entities.isNamedEntity(name) && semicolonFollows);

    if (!known) {
        reader.rewindToMark();
        if (semicolonFollows) {
            // it was written like an entity ("&name;") but the name is unknown
            characterReferenceError(String.format("invalid named referenece '%s'", name));
        }
        return null;
    }

    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // inside an attribute value this is not treated as a reference
        reader.rewindToMark();
        return null;
    }

    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon");
    }
    return new char[]{Entities.getCharacterByName(name)};
}