Java Code Examples for com.ibm.icu.text.UTF16#getCharCount()

The following examples show how to use com.ibm.icu.text.UTF16#getCharCount(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also check out the related API usage in the sidebar.
Example 1
Source File: Utility.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Escapes a string so that only code points in the range U+0020..U+007F
 * remain literal.
 * <ul>
 * <li>A backslash is written as two backslashes.</li>
 * <li>Any other code point inside U+0020..U+007F (both ends inclusive) is
 *     copied unchanged.</li>
 * <li>A code point outside that range is written as <code>\\u</code> followed
 *     by <code>hex(c, 4)</code> when it is at most U+FFFF, otherwise as
 *     <code>\\U</code> followed by <code>hex(c, 8)</code>.</li>
 * </ul>
 *
 * @param s the string to escape; iterated by code point
 * @return the escaped string
 */
public static final String escape(String s) {
    final StringBuilder out = new StringBuilder();
    int offset = 0;
    while (offset < s.length()) {
        final int codePoint = Character.codePointAt(s, offset);
        // Step over one or two UTF-16 code units, depending on the code point.
        offset += UTF16.getCharCount(codePoint);

        if (codePoint < ' ' || codePoint > 0x007F) {
            // Outside the literal range: emit a 4- or 8-digit escape.
            if (codePoint <= 0xFFFF) {
                out.append("\\u").append(hex(codePoint, 4));
            } else {
                out.append("\\U").append(hex(codePoint, 8));
            }
        } else if (codePoint == '\\') {
            out.append("\\\\"); // a doubled backslash
        } else {
            out.append((char) codePoint);
        }
    }
    return out.toString();
}
 
Example 2
Source File: Utility.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Formats every element of a character sequence with the sibling
 * <code>hex(value, width)</code> helper and appends the results, joined by
 * <code>separator</code>, to <code>result</code>.
 * E.g., for the input "AB", a width of 4 and a "," separator the appended
 * text is "0041,0042".
 *
 * @param s the text to format
 * @param width passed through to <code>hex(value, width)</code> for each element
 * @param separator appended before every element except the first
 * @param useCodePoints if true, elements are code points (a surrogate pair
 *        yields one group); if false, elements are individual UTF-16 code units
 * @param result the Appendable receiving the output
 * @return <code>result</code>, for call chaining
 * @throws IllegalIcuArgumentException wrapping any IOException thrown by
 *         <code>result</code>
 */
public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
    try {
        int offset = 0;
        if (useCodePoints) {
            while (offset < s.length()) {
                final int codePoint = Character.codePointAt(s, offset);
                if (offset > 0) {
                    result.append(separator);
                }
                result.append(hex(codePoint, width));
                // Advance by the number of code units this code point occupies.
                offset += UTF16.getCharCount(codePoint);
            }
        } else {
            while (offset < s.length()) {
                if (offset > 0) {
                    result.append(separator);
                }
                result.append(hex(s.charAt(offset), width));
                offset++;
            }
        }
        return result;
    } catch (IOException e) {
        throw new IllegalIcuArgumentException(e);
    }
}
 
Example 3
Source File: Utility.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Parses a Unicode identifier out of <code>str</code>, starting at
 * <code>pos[0]</code>.
 *
 * @param str the string to parse
 * @param pos INPUT-OUTPUT parameter.  On INPUT, pos[0] is the index of the
 * first character to examine; it must be less than str.length() and must
 * not point to a whitespace character.  On OUTPUT, when an identifier was
 * parsed, pos[0] is the index just past its last character.  When null is
 * returned, pos[0] is left unchanged.
 * @return the identifier, or null if the code point at pos[0] cannot start
 * a Unicode identifier.
 */
public static String parseUnicodeIdentifier(String str, int[] pos) {
    // assert(pos[0] < str.length());
    final StringBuilder identifier = new StringBuilder();
    int cursor = pos[0];
    while (cursor < str.length()) {
        final int codePoint = Character.codePointAt(str, cursor);
        final boolean atStart = identifier.length() == 0;
        if (atStart && !UCharacter.isUnicodeIdentifierStart(codePoint)) {
            return null; // no identifier here; pos is not updated
        }
        if (!atStart && !UCharacter.isUnicodeIdentifierPart(codePoint)) {
            break; // first code point past the identifier
        }
        identifier.appendCodePoint(codePoint);
        cursor += UTF16.getCharCount(codePoint);
    }
    pos[0] = cursor;
    return identifier.toString();
}
 
Example 4
Source File: Utility.java    From trekarta with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Produces a copy of <code>s</code> in which only code points from U+0020
 * through U+007F (inclusive) appear literally.  A backslash is doubled;
 * every code point outside that range becomes <code>\\u</code> plus
 * <code>hex(c, 4)</code> if it is at most U+FFFF, or <code>\\U</code> plus
 * <code>hex(c, 8)</code> otherwise.
 *
 * @param s the string to escape; walked one code point at a time
 * @return the escaped string
 */
public static final String escape(String s) {
    final StringBuilder escaped = new StringBuilder();
    int pos = 0;
    while (pos < s.length()) {
        final int cp = Character.codePointAt(s, pos);
        // Supplementary code points occupy two chars, everything else one.
        pos += UTF16.getCharCount(cp);

        final boolean literal = cp >= ' ' && cp <= 0x007F;
        if (!literal) {
            if (cp <= 0xFFFF) {
                escaped.append("\\u").append(hex(cp, 4));
            } else {
                escaped.append("\\U").append(hex(cp, 8));
            }
            continue;
        }
        if (cp == '\\') {
            escaped.append("\\\\"); // backslash escapes itself
        } else {
            escaped.append((char) cp);
        }
    }
    return escaped.toString();
}
 
Example 5
Source File: Utility.java    From trekarta with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Appends to <code>result</code> one <code>hex(value, width)</code> group per
 * element of <code>s</code>, with <code>separator</code> between groups.
 * E.g., the input "AB" with a width of 4 and a "," separator appends
 * "0041,0042".
 *
 * @param s the text to format
 * @param width forwarded to <code>hex(value, width)</code>
 * @param separator written between consecutive groups (never before the first)
 * @param useCodePoints true to emit one group per code point, false to emit
 *        one group per UTF-16 code unit
 * @param result destination of the formatted text
 * @return the same <code>result</code> object that was passed in
 * @throws IllegalIcuArgumentException if <code>result</code> throws an
 *         IOException (the IOException is wrapped)
 */
public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
    try {
        int index = 0;
        if (!useCodePoints) {
            // One group per UTF-16 code unit.
            while (index < s.length()) {
                if (index > 0) {
                    result.append(separator);
                }
                result.append(hex(s.charAt(index), width));
                index++;
            }
        } else {
            // One group per code point; surrogate pairs are combined.
            while (index < s.length()) {
                final int codePoint = Character.codePointAt(s, index);
                if (index > 0) {
                    result.append(separator);
                }
                result.append(hex(codePoint, width));
                index += UTF16.getCharCount(codePoint);
            }
        }
        return result;
    } catch (IOException e) {
        throw new IllegalIcuArgumentException(e);
    }
}
 
Example 6
Source File: Utility.java    From trekarta with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Reads a Unicode identifier from <code>str</code> beginning at
 * <code>pos[0]</code>.
 *
 * @param str the string to parse
 * @param pos INPUT-OUTPUT parameter.  On INPUT, pos[0] is the index of the
 * first character to examine; it must be less than str.length() and must
 * not point to a whitespace character.  On OUTPUT, after a successful
 * parse, pos[0] is the index following the last parsed character.  It is
 * not modified when null is returned.
 * @return the parsed identifier, or null if no valid identifier starts at
 * pos[0].
 */
public static String parseUnicodeIdentifier(String str, int[] pos) {
    // assert(pos[0] < str.length());
    final StringBuilder name = new StringBuilder();
    int index = pos[0];
    while (index < str.length()) {
        final int cp = Character.codePointAt(str, index);
        if (name.length() == 0) {
            // The first code point must be a valid identifier start.
            if (!UCharacter.isUnicodeIdentifierStart(cp)) {
                return null;
            }
        } else if (!UCharacter.isUnicodeIdentifierPart(cp)) {
            // Reached the first code point that cannot continue the identifier.
            break;
        }
        name.appendCodePoint(cp);
        index += UTF16.getCharCount(cp);
    }
    pos[0] = index;
    return name.toString();
}
 
Example 7
Source File: BreakIteratorWrapper.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Classifies the token delimited by two break positions.
 * <p>
 * The code points between the two positions are scanned in order; the first
 * one that is a digit or a letter decides the result.
 *
 * @param current break position where the token begins, or BreakIterator.DONE
 * @param next break position where the token ends, or BreakIterator.DONE
 * @return RuleBasedBreakIterator.WORD_NUMBER if a digit is found first,
 *         RuleBasedBreakIterator.WORD_LETTER if a letter is found first, and
 *         RuleBasedBreakIterator.WORD_NONE if either position is DONE or the
 *         token contains neither
 */
private int calcStatus(int current, int next) {
    if (current == BreakIterator.DONE || next == BreakIterator.DONE) {
        return RuleBasedBreakIterator.WORD_NONE;
    }
    // Both positions are offset by the 'start' field before indexing 'text'.
    int begin = start + current;
    int end = start + next;
    int codepoint;
    for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
        // Read at the loop cursor i. Reading at 'begin' on every iteration
        // would examine only the first code point of the token, so a token
        // such as "-5" would never be recognised as a number.
        codepoint = UTF16.charAt(text, 0, end, i);
        if (UCharacter.isDigit(codepoint)) {
            return RuleBasedBreakIterator.WORD_NUMBER;
        } else if (UCharacter.isLetter(codepoint)) {
            return RuleBasedBreakIterator.WORD_LETTER;
        }
    }
    return RuleBasedBreakIterator.WORD_NONE;
}
 
Example 8
Source File: ScriptIterator.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Advances to the next script run.
 * <p>
 * A run begins where the previous one ended and is extended one code point
 * at a time; on return {@code scriptStart}, {@code scriptLimit} and
 * {@code scriptCode} describe it.
 *
 * @return true if a run was produced, false if the previous run already
 *         reached the limit of the text.
 */
boolean next() {
  if (scriptLimit >= limit) {
    return false;
  }

  scriptCode = UScript.COMMON;
  scriptStart = scriptLimit;

  while (index < limit) {
    final int codePoint = UTF16.charAt(text, start, limit, index - start);
    final int script = getScript(codePoint);

    /*
     * From UTR #24: implementations that determine the boundaries between
     * characters of given scripts should never break between a non-spacing
     * mark and its base character. So, for boundary determination, a
     * non-spacing mark (whatever its own script value) inherits the script
     * value of its base character and always extends the current run.
     */
    final boolean extendsRun = isSameScript(scriptCode, script)
        || UCharacter.getType(codePoint) == ECharacterCategory.NON_SPACING_MARK;
    if (!extendsRun) {
      break;
    }

    index += UTF16.getCharCount(codePoint);

    // A run that is still Inherited or Common takes on the script of the
    // surrounding text as soon as a more specific script is seen.
    if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
      scriptCode = script;
    }
  }

  scriptLimit = index;
  return true;
}
 
Example 9
Source File: ScriptIterator.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Moves on to the next script run, if there is one.
 * <p>
 * The run starts at the previous {@code scriptLimit}; when this method
 * returns true, {@code scriptStart}, {@code scriptLimit} and
 * {@code scriptCode} have been updated to describe the new run.
 *
 * @return true if another script run exists, false otherwise.
 */
boolean next() {
    if (scriptLimit >= limit) {
        return false;
    }
    scriptCode = UScript.COMMON;
    scriptStart = scriptLimit;
    for (;;) {
        if (index >= limit) {
            break;
        }
        final int codePoint = UTF16.charAt(text, start, limit, index - start);
        final int script = getScript(codePoint);
        /*
         * From UTR #24: a boundary must never fall between a non-spacing mark
         * and its base character. For boundary determination a non-spacing
         * mark therefore inherits the script of its base character, whatever
         * its own script value is.
         */
        if (!isSameScript(scriptCode, script)
                && UCharacter.getType(codePoint) != ECharacterCategory.NON_SPACING_MARK) {
            break;
        }
        index += UTF16.getCharCount(codePoint);
        // Inherited or Common becomes the script code of the surrounding text.
        if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
            scriptCode = script;
        }
    }
    scriptLimit = index;
    return true;
}
 
Example 10
Source File: UScriptRun.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Find the next script run. Returns <code>false</code> if there
 * isn't another run, returns <code>true</code> if there is.
 * <p>
 * When a run is found, this method updates <code>scriptStart</code> (set to the
 * previous <code>scriptLimit</code>), <code>scriptLimit</code> (set to
 * <code>textIndex</code> once scanning stops) and <code>scriptCode</code>.
 *
 * @return <code>false</code> if there isn't another run, <code>true</code> if there is.
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public final boolean next()
{
    // if we've fallen off the end of the text, we're done
    if (scriptLimit >= textLimit) {
        return false;
    }

    // Each run starts out as COMMON and begins where the previous run ended.
    scriptCode  = UScript.COMMON;
    scriptStart = scriptLimit;
    
    // NOTE(review): syncFixup() is defined elsewhere; presumably it resets the
    // bookkeeping used by fixup() below -- confirm against its definition.
    syncFixup();
    
    while (textIndex < textLimit) {
        int ch = UTF16.charAt(text, textStart, textLimit, textIndex - textStart);
        // Despite its name, this is the value of UTF16.getCharCount(ch), i.e. the
        // amount by which textIndex is advanced (and possibly rewound) for ch.
        int codePointCount = UTF16.getCharCount(ch);
        int sc = UScript.getScript(ch);
        // Negative when ch is not a paired character (see the checks below).
        int pairIndex = getPairIndex(ch);

        // Consume ch up front; this is undone below if ch starts a new run.
        textIndex += codePointCount;
        
        // Paired character handling:
        //
        // if it's an open character, push it onto the stack.
        // if it's a close character, find the matching open on the
        // stack, and use that script code. Any non-matching open
        // characters above it on the stack will be popped.
        //
        // An even pairIndex is treated as an open character and an odd one as
        // a close character; clearing the low bit of a close index gives the
        // index of its open partner.
        if (pairIndex >= 0) {
            if ((pairIndex & 1) == 0) {
                push(pairIndex, scriptCode);
            } else {
                int pi = pairIndex & ~1;

                while (stackIsNotEmpty() && top().pairIndex != pi) {
                    pop();
                }

                if (stackIsNotEmpty()) {
                    sc = top().scriptCode;
                }
            }
        }

        if (sameScript(scriptCode, sc)) {
            // The run so far is only COMMON/INHERITED and ch has a more
            // specific script: adopt that script for the whole run.
            if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
                scriptCode = sc;

                // NOTE(review): fixup() is defined elsewhere; presumably it
                // propagates the new script to entries already on the stack.
                fixup(scriptCode);
            }

            // if this character is a close paired character,
            // pop the matching open character from the stack
            if (pairIndex >= 0 && (pairIndex & 1) != 0) {
                pop();
            }
        } else {
            // We've just seen the first character of
            // the next run. Back over it so we'll see
            // it again the next time.
            textIndex -= codePointCount;
            break;
        }
    }

    scriptLimit = textIndex;
    return true;
}
 
Example 11
Source File: PatternTokenizer.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
 * <p>
 * Each code point of the input is handled in one of three ways:
 * members of <code>escapeCharacters</code> are written with <code>appendEscaped</code>;
 * members of the "needs quoting" set are backslash-escaped when <code>usingSlash</code> is set,
 * otherwise wrapped in single quotes when <code>usingQuote</code> is set, otherwise written with
 * <code>appendEscaped</code>; all other code points are copied unchanged.
 *
 * @param string String passed to quote a literal string.
 * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
 */
public String quoteLiteral(String string) {
    // Lazily build (and cache in a field) the set of code points that need quoting.
    if (needingQuoteCharacters == null) {
        needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
        if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
        if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
    }
    StringBuffer result = new StringBuffer();
    // IN_QUOTE while an opening single quote has been written and not yet closed.
    int quotedChar = NO_QUOTE;
    int cp;
    // Iterate by code point; the increment uses the cp read at the top of the body.
    for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
        cp = UTF16.charAt(string, i);
        if (escapeCharacters.contains(cp)) {
            // we may have to fix up previous characters
            // (close any open quoted section before emitting the escape)
            if (quotedChar == IN_QUOTE) {
                result.append(SINGLE_QUOTE);
                quotedChar = NO_QUOTE;
            }
            appendEscaped(result, cp);
            continue;
        }
        
        if (needingQuoteCharacters.contains(cp)) {
            // if we have already started a quote
            // (just extend the open quoted section with this code point)
            if (quotedChar == IN_QUOTE) {
                UTF16.append(result, cp);
                if (usingQuote && cp == SINGLE_QUOTE) { // double it
                    result.append(SINGLE_QUOTE);
                }
                continue;
            }
            // otherwise not already in quote
            // (a backslash escape takes precedence over opening a quoted section)
            if (usingSlash) {
                result.append(BACK_SLASH);
                UTF16.append(result, cp);
                continue;
            }
            if (usingQuote) {
                if (cp == SINGLE_QUOTE) { // double it and continue
                    result.append(SINGLE_QUOTE);
                    result.append(SINGLE_QUOTE);
                    continue;
                }
                // open a quoted section; it is closed when a code point that
                // needs no quoting (or an escape character) is seen, or at the end
                result.append(SINGLE_QUOTE);
                UTF16.append(result, cp);
                quotedChar = IN_QUOTE;
                continue;
            }
            // we have no choice but to use \\u or \\U
            appendEscaped(result, cp);
            continue;
        }
        // otherwise cp doesn't need quoting
        // we may have to fix up previous characters
        if (quotedChar == IN_QUOTE) {
            result.append(SINGLE_QUOTE);
            quotedChar = NO_QUOTE;
        }
        UTF16.append(result, cp);
    }
    // all done.
    // we may have to fix up previous characters
    // (close a quoted section that is still open at the end of the input)
    if (quotedChar == IN_QUOTE) {
        result.append(SINGLE_QUOTE);
    }
    return result.toString();
}
 
Example 12
Source File: UCaseProps.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Performs a reverse full case folding of a string and adds the resulting
 * code points, together with their case closures, to a set.
 * <p>
 * The string is looked up by binary search in the <code>unfold</code> table.
 * When a row matches, every code point stored in that row is added to
 * <code>set</code> and passed to <code>addCaseClosure</code>.
 *
 * @param s the candidate full case folding string
 * @param set the set that receives the code points and their closures
 * @return true if the string was found in the table; false if it was not
 *         found, if there is no unfold data, or if <code>s</code> is null,
 *         shorter than two chars or longer than the table's string width
 */
public final boolean addStringCaseClosure(String s, UnicodeSet set) {
    if (unfold == null || s == null) {
        return false; /* no reverse case folding data, or no string */
    }

    final int length = s.length();
    if (length <= 1) {
        /*
         * Too short to match anything. Checking for "more than one code point"
         * would be more precise, but makes little practical difference: a
         * single supplementary code point would simply not be found.
         */
        return false;
    }

    final int rowCount = unfold[UNFOLD_ROWS];
    final int rowWidth = unfold[UNFOLD_ROW_WIDTH];
    final int stringWidth = unfold[UNFOLD_STRING_WIDTH];

    if (length > stringWidth) {
        return false; /* the string is too long to find any match */
    }

    /* binary search over the table rows */
    int low = 0;
    int high = rowCount;
    while (low < high) {
        final int mid = (low + high) / 2;
        final int rowOffset = (mid + 1) * rowWidth; // +1 to skip the header values above
        final int comparison = strcmpMax(s, rowOffset, stringWidth);

        if (comparison < 0) {
            high = mid;
        } else if (comparison > 0) {
            low = mid + 1;
        } else {
            /* found the string: add each code point, and its case closure */
            int column = stringWidth;
            while (column < rowWidth && unfold[rowOffset + column] != 0) {
                final int c = UTF16.charAt(unfold, rowOffset, unfold.length, column);
                set.add(c);
                addCaseClosure(c, set);
                column += UTF16.getCharCount(c);
            }
            return true;
        }
    }

    return false; /* string not found */
}
 
Example 13
Source File: Utility.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Matches text from a Replaceable against a parsing pattern.  Pattern
 * characters are matched literally and case-sensitively, except for the
 * following special character:
 *
 * ~  zero or more Pattern_White_Space chars
 *
 * If the end of the pattern is reached with every element matched, the
 * index of the first unparsed character is returned.  Otherwise, including
 * when the text runs out before the pattern does, -1 is returned.
 * @param pat pattern that controls parsing
 * @param text text to be parsed, starting at index
 * @param index offset to first character to parse
 * @param limit offset after last character to parse
 * @return index after last parsed character, or -1 on parse failure.
 */
public static int parsePattern(String pat,
        Replaceable text,
        int index,
        int limit) {
    final int patLength = pat.length();

    // an empty pattern matches immediately, consuming nothing
    if (patLength == 0) {
        return index;
    }

    int patPos = 0;
    int patChar = Character.codePointAt(pat, patPos);

    while (index < limit) {
        final int textChar = text.char32At(index);

        if (patChar == '~') {
            // '~' consumes white space one code point at a time
            if (PatternProps.isWhiteSpace(textChar)) {
                index += UTF16.getCharCount(textChar);
                continue;
            }
            // not white space: '~' is finished
            patPos++;
            if (patPos == patLength) {
                return index; // success; textChar unparsed
            }
            // textChar is examined again against the next pattern character
        } else {
            if (textChar != patChar) {
                return -1; // literal mismatch
            }
            final int width = UTF16.getCharCount(textChar);
            index += width;
            patPos += width;
            if (patPos == patLength) {
                return index; // success; textChar parsed
            }
        }

        patChar = UTF16.charAt(pat, patPos);
    }

    return -1; // text ended before end of pat
}
 
Example 14
Source File: UCaseProps.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Maps a string back to the single code points whose full case folding it
 * is, and adds those code points plus their case closures to a set.
 * <p>
 * The lookup is a binary search of the <code>unfold</code> table.  For a
 * matching row, each code point stored in the row is added to
 * <code>set</code> and handed to <code>addCaseClosure</code>.
 *
 * @param s the string to look up
 * @param set the set to which code points and closure items are added
 * @return true if the string was found; false if it is absent from the
 *         table, if <code>unfold</code> or <code>s</code> is null, or if the
 *         length of <code>s</code> is at most 1 or exceeds the table's
 *         string width
 */
public final boolean addStringCaseClosure(String s, UnicodeSet set) {
    /* no reverse case folding data, or no string */
    if (unfold == null || s == null) {
        return false;
    }

    /*
     * A string of length 0 or 1 is too short to find any match.  Testing for
     * more than one code point would be more precise, but a single
     * supplementary code point would just not be found anyway.
     */
    final int length = s.length();
    if (length <= 1) {
        return false;
    }

    final int unfoldRows = unfold[UNFOLD_ROWS];
    final int unfoldRowWidth = unfold[UNFOLD_ROW_WIDTH];
    final int unfoldStringWidth = unfold[UNFOLD_STRING_WIDTH];

    /* the string is too long to find any match */
    if (length > unfoldStringWidth) {
        return false;
    }

    /* do a binary search for the string */
    int first = 0;
    int last = unfoldRows;
    while (first < last) {
        final int row = (first + last) / 2;
        final int offset = (row + 1) * unfoldRowWidth; // +1 to skip the header values above
        final int order = strcmpMax(s, offset, unfoldStringWidth);

        if (order > 0) {
            first = row + 1;
            continue;
        }
        if (order < 0) {
            last = row;
            continue;
        }

        /* found the string: add each code point, and its case closure */
        int k = unfoldStringWidth;
        while (k < unfoldRowWidth && unfold[offset + k] != 0) {
            final int c = UTF16.charAt(unfold, offset, unfold.length, k);
            set.add(c);
            addCaseClosure(c, set);
            k += UTF16.getCharCount(c);
        }
        return true;
    }

    return false; /* string not found */
}
 
Example 15
Source File: Utility.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Parses text held in a Replaceable according to a parsing pattern.
 * Every pattern character must match the text literally and
 * case-sensitively, except for this special character:
 *
 * ~  zero or more Pattern_White_Space chars
 *
 * When the whole pattern has been matched, the index of the first unparsed
 * character is returned.  On a mismatch, or if the text ends before the
 * pattern does, -1 is returned.
 * @param pat pattern that controls parsing
 * @param text text to be parsed, starting at index
 * @param index offset to first character to parse
 * @param limit offset after last character to parse
 * @return index after last parsed character, or -1 on parse failure.
 */
public static int parsePattern(String pat,
        Replaceable text,
        int index,
        int limit) {
    final int patternEnd = pat.length();

    // empty pattern matches immediately
    if (patternEnd == 0) {
        return index;
    }

    int ipat = 0;
    int expected = Character.codePointAt(pat, ipat);

    while (index < limit) {
        final int actual = text.char32At(index);
        final boolean wildcard = expected == '~';

        if (wildcard && PatternProps.isWhiteSpace(actual)) {
            // '~' keeps absorbing white space
            index += UTF16.getCharCount(actual);
            continue;
        }

        if (wildcard) {
            // '~' is done; move to the next pattern character and
            // look at the same text character again
            ipat += 1;
        } else if (actual == expected) {
            // literal match: advance both text and pattern
            final int units = UTF16.getCharCount(actual);
            index += units;
            ipat += units;
        } else {
            return -1; // match failure of literal
        }

        if (ipat == patternEnd) {
            return index; // success; pattern fully consumed
        }
        expected = UTF16.charAt(pat, ipat);
    }

    return -1; // text ended before end of pat
}