com.ibm.icu.text.UTF16 Java Examples

Source File: IntTrie.java From fitnotifications with Apache License 2.0

6 votes

/**
* Gets the value stored for a supplementary code point supplied as a
* surrogate pair.
* @param lead a lead surrogate
* @param trail a trail surrogate
* @return the data value at the pair's folded offset when that offset is
*         positive, otherwise m_initialValue_
* @throws IllegalArgumentException if lead is not a lead surrogate or trail
*         is not a trail surrogate
*/
public final int getSurrogateValue(char lead, char trail)
{
    final boolean wellFormedPair =
        UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail);
    if (!wellFormedPair) {
        throw new IllegalArgumentException(
            "Argument characters do not form a supplementary character");
    }

    // fold the lead/trail units into a data offset; a non-positive offset
    // is treated as "no data" and falls back to the initial value
    final int foldedOffset = getSurrogateOffset(lead, trail);
    return (foldedOffset > 0) ? m_data_[foldedOffset] : m_initialValue_;
}

Source File: Utility.java From trekarta with GNU General Public License v3.0

6 votes

/**
 * Appends the hex form of every element of a character sequence to the
 * given Appendable, inserting the separator between consecutive elements.
 * Elements are code points when useCodePoints is true and UTF-16 code
 * units otherwise; each one is formatted by the two-argument hex overload
 * using the given width.  For instance, the input "AB" with width 4 and
 * separator "," is expected to produce "0041,0042".
 * @param s the text to convert
 * @param width passed through to the two-argument hex overload
 * @param separator text appended before every element except the first
 * @param useCodePoints true to walk by code point, false to walk by char
 * @param result the Appendable that receives the output
 * @return the same result object that was passed in
 * @throws IllegalIcuArgumentException wrapping any IOException raised by
 *         the Appendable
 */
public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
    try {
        int index = 0;
        if (useCodePoints) {
            while (index < s.length()) {
                final int codePoint = Character.codePointAt(s, index);
                if (index != 0) {
                    result.append(separator);
                }
                result.append(hex(codePoint,width));
                // step over one or two chars depending on the code point
                index += UTF16.getCharCount(codePoint);
            }
        } else {
            while (index < s.length()) {
                if (index != 0) {
                    result.append(separator);
                }
                result.append(hex(s.charAt(index),width));
                ++index;
            }
        }
        return result;
    } catch (IOException e) {
        throw new IllegalIcuArgumentException(e);
    }
}

Source File: Utility.java From trekarta with GNU General Public License v3.0

6 votes

/**
 * Escapes a string so that only code points in U+0020..U+007F remain
 * literal.  A backslash is written as two backslashes; every code point
 * outside that range is written as a backslash, then a lowercase u and
 * 4 hex digits (code points up to U+FFFF) or an uppercase U and 8 hex
 * digits (anything larger).
 * @param s the string to escape
 * @return the escaped string
 */
public static final String escape(String s) {
    StringBuilder out = new StringBuilder();
    int index = 0;
    while (index < s.length()) {
        final int cp = Character.codePointAt(s, index);
        index += UTF16.getCharCount(cp);
        if (cp < ' ' || cp > 0x007F) {
            // outside the literal range: emit a Unicode escape
            if (cp <= 0xFFFF) {
                out.append("\\u");
                out.append(hex(cp, 4));
            } else {
                out.append("\\U");
                out.append(hex(cp, 8));
            }
        } else if (cp == '\\') {
            out.append("\\\\"); // a single backslash becomes two
        } else {
            out.append((char)cp);
        }
    }
    return out.toString();
}

Source File: TrieIterator.java From trekarta with GNU General Public License v3.0

6 votes

/**
* Handles the case where the current trail index is not positive.
* When m_nextIndex_ is less than or equal to zero, m_nextCodepoint_ is
* advanced by TRAIL_SURROGATE_COUNT_ - 1, m_nextIndex_ is recomputed from
* the folding offset of the value stored for the new code point's lead
* surrogate (minus one), and m_nextBlockIndex_ is set to
* DATA_BLOCK_LENGTH_.
* Per the original documentation this is only used for supplementary
* code points, because the offset to the trail indexes could be 0.
* @return true if the iterator state was advanced as described above,
*         false if m_nextIndex_ was positive and nothing changed
* @throws NullPointerException if the trie has no DataManipulate object
*/
private final boolean checkNullNextTrailIndex()
{
    if (m_nextIndex_ > 0) {
        return false;
    }

    m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
    final int lead = UTF16.getLeadSurrogate(m_nextCodepoint_);
    final int stage2Base =
           m_trie_.m_index_[lead >> Trie.INDEX_STAGE_1_SHIFT_]
                                           << Trie.INDEX_STAGE_2_SHIFT_;
    // the code point has already been advanced when this check fires
    if (m_trie_.m_dataManipulate_ == null) {
        throw new NullPointerException(
                    "The field DataManipulate in this Trie is null");
    }
    final int leadDataIndex = stage2Base + (lead & Trie.INDEX_STAGE_3_MASK_);
    m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
                       m_trie_.getValue(leadDataIndex));
    m_nextIndex_ --;
    m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
    return true;
}

Source File: ReplaceableUCharacterIterator.java From trekarta with GNU General Public License v3.0

6 votes

/**
 * Returns the code point at the current position without moving the
 * iterator.  If the current unit is a lead surrogate followed by a trail
 * surrogate the combined supplementary code point is returned; otherwise
 * the result of current() is returned unchanged.
 * @return current codepoint
 */
@Override
public int currentCodePoint(){
    final int first = current();
    if (!UTF16.isLeadSurrogate((char)first)) {
        return first;
    }

    // peek at the following unit: step forward, read it, then step back
    // so that the index ends where it started
    next();
    final int second = current();
    previous();

    return UTF16.isTrailSurrogate((char)second)
            ? Character.toCodePoint((char)first, (char)second)
            : first;
}

Source File: Utility.java From trekarta with GNU General Public License v3.0

6 votes

/**
 * Parses a Unicode identifier starting at pos[0].  The first code point
 * must satisfy UCharacter.isUnicodeIdentifierStart and each later one
 * UCharacter.isUnicodeIdentifierPart; parsing stops at the first code
 * point that does not, or at the end of the string.
 * @param str the string to parse
 * @param pos INPUT-OUTPUT parameter.  On INPUT, pos[0] is the index of
 * the first character to examine; the original contract requires
 * pos[0] < str.length() and that it not point at whitespace.  On OUTPUT,
 * pos[0] is the index just past the last parsed character.  It is left
 * untouched when null is returned.
 * @return the identifier, or null if the code point at pos[0] cannot
 * start an identifier.  If pos[0] is already at or past the end of the
 * string the loop never runs and an empty string is returned.
 */
public static String parseUnicodeIdentifier(String str, int[] pos) {
    // assert(pos[0] < str.length());
    StringBuilder identifier = new StringBuilder();
    int cursor = pos[0];
    while (cursor < str.length()) {
        final int cp = Character.codePointAt(str, cursor);
        final boolean atStart = identifier.length() == 0;
        final boolean accepted = atStart
                ? UCharacter.isUnicodeIdentifierStart(cp)
                : UCharacter.isUnicodeIdentifierPart(cp);
        if (!accepted) {
            if (atStart) {
                return null;
            }
            break;
        }
        identifier.appendCodePoint(cp);
        cursor += UTF16.getCharCount(cp);
    }
    pos[0] = cursor;
    return identifier.toString();
}

Source File: Trie.java From trekarta with GNU General Public License v3.0

6 votes

/**
* Internal trie getter: maps a code point to its offset in the data array.
* Negative values and values above UCharacter.MAX_VALUE are rejected.
* @param ch codepoint
* @return offset to data, or -1 if ch is out of range
*/
protected final int getCodePointOffset(int ch)
{
    if (ch < 0) {
        return -1;
    }
    if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
        // below the surrogate range the raw offset can be used directly
        return getRawOffset(0, (char)ch);
    }
    if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
        // remaining BMP code points, surrogates included
        return getBMPOffset((char)ch);
    }
    if (ch <= UCharacter.MAX_VALUE) {
        // supplementary: look up through the lead surrogate, passing the
        // masked low bits of the code point as the trail part
        return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
                                  (char)(ch & SURROGATE_MASK_));
    }
    // beyond the last valid code point
    return -1;
}

Source File: CharTrie.java From trekarta with GNU General Public License v3.0

6 votes

/**
* Gets the value associated with the codepoint.
* If no offset can be found for the codepoint, the default value
* m_initialValue_ is returned.
* @param ch codepoint
* @return the trie value for ch, or m_initialValue_ if there is none
*/
public final char getCodePointValue(int ch)
{
    // fastpath for U+0000..U+D7FF: inlined copy of getRawOffset()
    if (0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
        final int stage2Base =
                m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_;
        return m_data_[stage2Base + (ch & INDEX_STAGE_3_MASK_)];
    }

    // everything else goes through the general lookup; a negative
    // offset signals an error and yields the default value
    final int offset = getCodePointOffset(ch);
    if (offset < 0) {
        return m_initialValue_;
    }
    return m_data_[offset];
}

Source File: BreakIteratorWrapper.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Returns true if the text between the two positions starts with an emoji
 * character or sequence.  A code point that is in EMOJI but not in EMOJI_RK
 * always counts.  A code point in both sets counts only when it is
 * immediately followed, before the end position, by U+FE0F or U+20E3
 * (per the original comment: an emoji presentation selector or keycap).
 */
private boolean isEmoji(int current, int next) {
  final int begin = start + current;
  final int end = start + next;
  final int codepoint = UTF16.charAt(text, 0, end, begin);
  if (!EMOJI.contains(codepoint)) {
    return false;
  }
  if (!EMOJI_RK.contains(codepoint)) {
    return true;
  }
  // in EmojiRK: require evidence that it forms an emoji sequence
  final int trailer = begin + Character.charCount(codepoint);
  if (trailer >= end) {
    return false;
  }
  return text[trailer] == 0xFE0F || text[trailer] == 0x20E3;
}

Source File: CharacterIteration.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Returns the code point at the iterator's current position.  The
 * iterator index is the same on return as on entry, provided the
 * iterator's next()/previous() calls are symmetric.
 * A lead surrogate followed by a trail surrogate is combined into a
 * supplementary code point; an unpaired lead surrogate is returned as is.
 * @param ci the iterator to read from
 * @return the code point, or DONE32 when the current char is
 *         CharacterIterator.DONE and the index is at or past the end index
 */
public static int current32(CharacterIterator ci) {
    final char lead = ci.current();
    if (lead < UTF16.LEAD_SURROGATE_MIN_VALUE) {
        return lead;
    }
    if (UTF16.isLeadSurrogate(lead)) {
        // peek at the next unit, then restore the position
        final char trail = ci.next();
        ci.previous();
        if (!UTF16.isTrailSurrogate(trail)) {
            return lead;
        }
        return ((lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
               (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
               UTF16.SUPPLEMENTARY_MIN_VALUE;
    }
    // DONE is also a legal char value, so only report the end when the
    // index confirms it
    if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
        return DONE32;
    }
    return lead;
}

Source File: CharacterIteration.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Moves the iterator back by one code point and returns that code point.
 * When the preceding unit is a trail surrogate that itself follows a lead
 * surrogate, both units are consumed and combined; otherwise only one
 * unit is consumed.
 * @param ci the iterator to move
 * @return the code point before the original position, or DONE32 if the
 *         iterator was already at (or before) its begin index
 */
public static int previous32(CharacterIterator ci) {
    if (ci.getIndex() <= ci.getBeginIndex()) {
        return DONE32;
    }
    final char trail = ci.previous();
    if (!UTF16.isTrailSurrogate(trail) || ci.getIndex() <= ci.getBeginIndex()) {
        return trail;
    }
    final char lead = ci.previous();
    if (UTF16.isLeadSurrogate(lead)) {
        return ((lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
               (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
               UTF16.SUPPLEMENTARY_MIN_VALUE;
    }
    // unpaired trail surrogate: undo the extra step back
    ci.next();
    return trail;
}

Source File: CharacterIteration.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Completes reading a code point whose first unit, lead, has already been
 * fetched.  If lead does not exceed LEAD_SURROGATE_MAX_VALUE the next unit
 * is read; a trail surrogate there is combined with lead, anything else is
 * pushed back with previous().
 * NOTE(review): lead is not checked against LEAD_SURROGATE_MIN_VALUE here;
 * presumably callers only pass values at or above it - confirm.
 * @param ci the iterator positioned on the unit that produced lead
 * @param lead the unit already read from the iterator
 * @return the combined code point, lead itself when no pairing applies, or
 *         DONE32 when lead is CharacterIterator.DONE and the index is at
 *         or past the end index
 */
public static int nextTrail32(CharacterIterator ci, int lead) {
    if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
        return DONE32;
    }
    if (lead > UTF16.LEAD_SURROGATE_MAX_VALUE) {
        return lead;
    }
    final char cTrail = ci.next();
    if (UTF16.isTrailSurrogate(cTrail)) {
        return ((lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
               (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
               UTF16.SUPPLEMENTARY_MIN_VALUE;
    }
    // not a pair: step back so the unit is seen again by the caller
    ci.previous();
    return lead;
}

Source File: UCharacterName.java From fitnotifications with Apache License 2.0

6 votes

/**
* Gets the character extended type.
* Non-characters map to NON_CHARACTER_, and the general SURROGATE category
* is split into LEAD_SURROGATE_ and TRAIL_SURROGATE_; every other code
* point keeps the type reported by UCharacter.getType.
* @param ch character to be tested
* @return extended type it is associated with
*/
private static int getType(int ch)
{
    if (UCharacterUtility.isNonCharacter(ch)) {
        // not a character: report the dedicated extended type
        return NON_CHARACTER_;
    }
    final int category = UCharacter.getType(ch);
    if (category != UCharacterCategory.SURROGATE) {
        return category;
    }
    // surrogates up to the last lead value are leads, the rest trails
    return (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) ? LEAD_SURROGATE_
                                                  : TRAIL_SURROGATE_;
}

Source File: IntTrie.java From fitnotifications with Apache License 2.0

6 votes

/**
* Gets the value associated with the codepoint.
* If no offset can be found for the codepoint, the default value
* m_initialValue_ is returned.
* @param ch codepoint
* @return the trie value for ch, or m_initialValue_ if there is none
*/
public final int getCodePointValue(int ch)
{
    // fastpath for U+0000..U+D7FF: inlined copy of getRawOffset()
    if (0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
        final int stage2Base =
                m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_;
        return m_data_[stage2Base + (ch & INDEX_STAGE_3_MASK_)];
    }

    // everything else goes through the general lookup; a negative
    // offset yields the default value
    final int offset = getCodePointOffset(ch);
    if (offset < 0) {
        return m_initialValue_;
    }
    return m_data_[offset];
}

Source File: Utility.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Parses a Unicode identifier starting at pos[0].  The first code point
 * must satisfy UCharacter.isUnicodeIdentifierStart and each later one
 * UCharacter.isUnicodeIdentifierPart; parsing stops at the first code
 * point that does not, or at the end of the string.
 * @param str the string to parse
 * @param pos INPUT-OUTPUT parameter.  On INPUT, pos[0] is the index of
 * the first character to examine; the original contract requires
 * pos[0] < str.length() and that it not point at whitespace.  On OUTPUT,
 * pos[0] is the index just past the last parsed character.  It is left
 * untouched when null is returned.
 * @return the identifier, or null if the code point at pos[0] cannot
 * start an identifier.  If pos[0] is already at or past the end of the
 * string the loop never runs and an empty string is returned.
 */
public static String parseUnicodeIdentifier(String str, int[] pos) {
    // assert(pos[0] < str.length());
    StringBuilder identifier = new StringBuilder();
    int cursor = pos[0];
    while (cursor < str.length()) {
        final int cp = Character.codePointAt(str, cursor);
        final boolean atStart = identifier.length() == 0;
        final boolean accepted = atStart
                ? UCharacter.isUnicodeIdentifierStart(cp)
                : UCharacter.isUnicodeIdentifierPart(cp);
        if (!accepted) {
            if (atStart) {
                return null;
            }
            break;
        }
        identifier.appendCodePoint(cp);
        cursor += UTF16.getCharCount(cp);
    }
    pos[0] = cursor;
    return identifier.toString();
}

Source File: Utility.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Appends the hex form of every element of a character sequence to the
 * given Appendable, inserting the separator between consecutive elements.
 * Elements are code points when useCodePoints is true and UTF-16 code
 * units otherwise; each one is formatted by the two-argument hex overload
 * using the given width.  For instance, the input "AB" with width 4 and
 * separator "," is expected to produce "0041,0042".
 * @param s the text to convert
 * @param width passed through to the two-argument hex overload
 * @param separator text appended before every element except the first
 * @param useCodePoints true to walk by code point, false to walk by char
 * @param result the Appendable that receives the output
 * @return the same result object that was passed in
 * @throws IllegalIcuArgumentException wrapping any IOException raised by
 *         the Appendable
 */
public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
    try {
        int index = 0;
        if (useCodePoints) {
            while (index < s.length()) {
                final int codePoint = Character.codePointAt(s, index);
                if (index != 0) {
                    result.append(separator);
                }
                result.append(hex(codePoint,width));
                // step over one or two chars depending on the code point
                index += UTF16.getCharCount(codePoint);
            }
        } else {
            while (index < s.length()) {
                if (index != 0) {
                    result.append(separator);
                }
                result.append(hex(s.charAt(index),width));
                ++index;
            }
        }
        return result;
    } catch (IOException e) {
        throw new IllegalIcuArgumentException(e);
    }
}

Source File: Trie.java From fitnotifications with Apache License 2.0

6 votes

/**
* Internal trie getter: maps a code point to its offset in the data array.
* Negative values and values above UCharacter.MAX_VALUE are rejected.
* @param ch codepoint
* @return offset to data, or -1 if ch is out of range
*/
protected final int getCodePointOffset(int ch)
{
    if (ch < 0) {
        return -1;
    }
    if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
        // below the surrogate range the raw offset can be used directly
        return getRawOffset(0, (char)ch);
    }
    if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
        // remaining BMP code points, surrogates included
        return getBMPOffset((char)ch);
    }
    if (ch <= UCharacter.MAX_VALUE) {
        // supplementary: look up through the lead surrogate, passing the
        // masked low bits of the code point as the trail part
        return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
                                  (char)(ch & SURROGATE_MASK_));
    }
    // beyond the last valid code point
    return -1;
}

Source File: UCharacterName.java From trekarta with GNU General Public License v3.0

6 votes

/**
* Gets the character extended type.
* Non-characters map to NON_CHARACTER_, and the general SURROGATE category
* is split into LEAD_SURROGATE_ and TRAIL_SURROGATE_; every other code
* point keeps the type reported by UCharacter.getType.
* @param ch character to be tested
* @return extended type it is associated with
*/
private static int getType(int ch)
{
    if (UCharacterUtility.isNonCharacter(ch)) {
        // not a character: report the dedicated extended type
        return NON_CHARACTER_;
    }
    final int category = UCharacter.getType(ch);
    if (category != UCharacterCategory.SURROGATE) {
        return category;
    }
    // surrogates up to the last lead value are leads, the rest trails
    return (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) ? LEAD_SURROGATE_
                                                  : TRAIL_SURROGATE_;
}

Source File: ReplaceableUCharacterIterator.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Returns the code point at the current position without moving the
 * iterator.  If the current unit is a lead surrogate followed by a trail
 * surrogate the combined supplementary code point is returned; otherwise
 * the result of current() is returned unchanged.
 * @return current codepoint
 */
@Override
public int currentCodePoint(){
    final int first = current();
    if (!UTF16.isLeadSurrogate((char)first)) {
        return first;
    }

    // peek at the following unit: step forward, read it, then step back
    // so that the index ends where it started
    next();
    final int second = current();
    previous();

    return UTF16.isTrailSurrogate((char)second)
            ? Character.toCodePoint((char)first, (char)second)
            : first;
}

Source File: BreakIteratorWrapper.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

6 votes

/**
 * Computes a word rule status for the text between two break positions.
 * The code points in the span are scanned in order and the first digit or
 * letter found decides the result.
 *
 * @param current offset of the span start, relative to start, or
 *                BreakIterator.DONE
 * @param next offset of the span end, relative to start, or
 *             BreakIterator.DONE
 * @return WORD_NUMBER if the first digit-or-letter code point is a digit,
 *         WORD_LETTER if it is a letter, and WORD_NONE if either position
 *         is DONE or the span contains neither
 */
private int calcStatus(int current, int next) {
    if (current == BreakIterator.DONE || next == BreakIterator.DONE) {
        return RuleBasedBreakIterator.WORD_NONE;
    }
    int begin = start + current;
    int end = start + next;
    int codepoint;
    for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
        // Read at the loop cursor i. Reading at begin on every pass would
        // classify only the first code point of the span, so a span such
        // as "-5" would never be recognised as containing a digit.
        codepoint = UTF16.charAt(text, 0, end, i);
        if (UCharacter.isDigit(codepoint)) {
            return RuleBasedBreakIterator.WORD_NUMBER;
        } else if (UCharacter.isLetter(codepoint)) {
            return RuleBasedBreakIterator.WORD_LETTER;
        }
    }
    return RuleBasedBreakIterator.WORD_NONE;
}

Source File: CharTrie.java From fitnotifications with Apache License 2.0

6 votes

/**
* Gets the value associated with the codepoint.
* If no offset can be found for the codepoint, the default value
* m_initialValue_ is returned.
* @param ch codepoint
* @return the trie value for ch, or m_initialValue_ if there is none
*/
public final char getCodePointValue(int ch)
{
    // fastpath for U+0000..U+D7FF: inlined copy of getRawOffset()
    if (0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
        final int stage2Base =
                m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_;
        return m_data_[stage2Base + (ch & INDEX_STAGE_3_MASK_)];
    }

    // everything else goes through the general lookup; a negative
    // offset signals an error and yields the default value
    final int offset = getCodePointOffset(ch);
    if (offset < 0) {
        return m_initialValue_;
    }
    return m_data_[offset];
}

Source File: TrieIterator.java From fitnotifications with Apache License 2.0

6 votes

/**
* Handles the case where the current trail index is not positive.
* When m_nextIndex_ is less than or equal to zero, m_nextCodepoint_ is
* advanced by TRAIL_SURROGATE_COUNT_ - 1, m_nextIndex_ is recomputed from
* the folding offset of the value stored for the new code point's lead
* surrogate (minus one), and m_nextBlockIndex_ is set to
* DATA_BLOCK_LENGTH_.
* Per the original documentation this is only used for supplementary
* code points, because the offset to the trail indexes could be 0.
* @return true if the iterator state was advanced as described above,
*         false if m_nextIndex_ was positive and nothing changed
* @throws NullPointerException if the trie has no DataManipulate object
*/
private final boolean checkNullNextTrailIndex()
{
    if (m_nextIndex_ > 0) {
        return false;
    }

    m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
    final int lead = UTF16.getLeadSurrogate(m_nextCodepoint_);
    final int stage2Base =
           m_trie_.m_index_[lead >> Trie.INDEX_STAGE_1_SHIFT_]
                                           << Trie.INDEX_STAGE_2_SHIFT_;
    // the code point has already been advanced when this check fires
    if (m_trie_.m_dataManipulate_ == null) {
        throw new NullPointerException(
                    "The field DataManipulate in this Trie is null");
    }
    final int leadDataIndex = stage2Base + (lead & Trie.INDEX_STAGE_3_MASK_);
    m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
                       m_trie_.getValue(leadDataIndex));
    m_nextIndex_ --;
    m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
    return true;
}

Source File: Utility.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Escapes a string so that only code points in U+0020..U+007F remain
 * literal.  A backslash is written as two backslashes; every code point
 * outside that range is written as a backslash, then a lowercase u and
 * 4 hex digits (code points up to U+FFFF) or an uppercase U and 8 hex
 * digits (anything larger).
 * @param s the string to escape
 * @return the escaped string
 */
public static final String escape(String s) {
    StringBuilder out = new StringBuilder();
    int index = 0;
    while (index < s.length()) {
        final int cp = Character.codePointAt(s, index);
        index += UTF16.getCharCount(cp);
        if (cp < ' ' || cp > 0x007F) {
            // outside the literal range: emit a Unicode escape
            if (cp <= 0xFFFF) {
                out.append("\\u");
                out.append(hex(cp, 4));
            } else {
                out.append("\\U");
                out.append(hex(cp, 8));
            }
        } else if (cp == '\\') {
            out.append("\\\\"); // a single backslash becomes two
        } else {
            out.append((char)cp);
        }
    }
    return out.toString();
}

Source File: ScriptIterator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

5 votes

/**
 * Advances to the next script run.  The run starts at the previous
 * scriptLimit and extends while each code point either has the same
 * script as the run (per isSameScript) or is a non-spacing mark.
 *
 * @return true if there is another script run, false otherwise.
 */
boolean next() {
    if (scriptLimit >= limit) {
        return false;
    }
    scriptCode = UScript.COMMON;
    scriptStart = scriptLimit;
    while (index < limit) {
        final int ch = UTF16.charAt(text, start, limit, index - start);
        final int sc = getScript(ch);
        /*
         * Per UTR #24 a boundary must never fall between a non-spacing
         * mark and its base character, so a non-spacing mark continues
         * the run regardless of its own script value.
         */
        final boolean continuesRun = isSameScript(scriptCode, sc)
                || UCharacter.getType(ch) == ECharacterCategory.NON_SPACING_MARK;
        if (!continuesRun) {
            break;
        }
        index += UTF16.getCharCount(ch);
        /*
         * A run that is still Inherited or Common adopts the first
         * script above Inherited that it encounters.
         */
        if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
            scriptCode = sc;
        }
    }
    scriptLimit = index;
    return true;
}

Source File: CharsTrie.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Traverses the trie from the current state for the
 * one or two UTF-16 code units for this input code point.
 * Values up to 0xffff are passed to next() as a single unit; larger values
 * are fed as lead then trail surrogate, and the trail step is only taken
 * if the lead step reports hasNext().
 * @param cp A Unicode code point 0..0x10ffff.
 * @return The match/value Result; NO_MATCH if the lead surrogate step
 *         cannot be continued.
 * @stable ICU 4.8
 */
public Result nextForCodePoint(int cp) {
    if (cp <= 0xffff) {
        return next(cp);
    }
    if (!next(UTF16.getLeadSurrogate(cp)).hasNext()) {
        return Result.NO_MATCH;
    }
    return next(UTF16.getTrailSurrogate(cp));
}

Source File: Trie.java From trekarta with GNU General Public License v3.0

5 votes

/**
* Gets the offset to data which the BMP character points to.
* A character in the lead surrogate range is looked up with
* LEAD_INDEX_OFFSET_ as the base; every other character uses base 0.
* @param ch BMP character
* @return offset to data
*/
protected final int getBMPOffset(char ch)
{
    final boolean inLeadRange = ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
                                && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE;
    if (inLeadRange) {
        return getRawOffset(LEAD_INDEX_OFFSET_, ch);
    }
    return getRawOffset(0, ch);
}

Source File: StringTokenizer.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Gets the index of the first non-delimiter at or after offset.
 * Delimiters are tested against the delims lookup table when it is set
 * and against m_delimiters_ otherwise.
 * @param offset index into the source string at which to start
 * @return index of the next non-delimiter, or
 *         (- source string length - 1) if offset is negative or no
 *         non-delimiter is found before the end of the source
 */
private int getNextNonDelimiter(int offset)
{
    if (offset < 0) {
        return -1 - m_length_;
    }
    final boolean useSet = (delims == null);
    int cursor = offset;
    do {
        final int c = UTF16.charAt(m_source_, cursor);
        final boolean isDelimiter = useSet
                ? m_delimiters_.contains(c)
                : (c < delims.length && delims[c]);
        if (!isDelimiter) {
            break;
        }
        cursor ++;
    } while (cursor < m_length_);

    return (cursor < m_length_) ? cursor : -1 - m_length_;
}

Source File: StringTokenizer.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Gets the index of the first delimiter at or after offset.
 * Delimiters are tested against the delims lookup table when it is set
 * and against m_delimiters_ otherwise.
 * @param offset index into the source string at which to start
 * @return index of the next delimiter, or
 *         (- source string length - 1) if offset is negative or no
 *         delimiter is found before the end of the source
 */
private int getNextDelimiter(int offset)
{
    if (offset < 0) {
        return -1 - m_length_;
    }
    final boolean useSet = (delims == null);
    int cursor = offset;
    do {
        final int c = UTF16.charAt(m_source_, cursor);
        final boolean isDelimiter = useSet
                ? m_delimiters_.contains(c)
                : (c < delims.length && delims[c]);
        if (isDelimiter) {
            break;
        }
        cursor ++;
    } while (cursor < m_length_);

    return (cursor < m_length_) ? cursor : -1 - m_length_;
}

Source File: ScriptIterator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Advances to the next script run.  The run starts at the previous
 * scriptLimit and extends while each code point either has the same
 * script as the run (per isSameScript) or is a non-spacing mark.
 * 
 * @return true if there is another script run, false otherwise.
 */
boolean next() {
  if (scriptLimit >= limit) {
    return false;
  }

  scriptCode = UScript.COMMON;
  scriptStart = scriptLimit;

  while (index < limit) {
    final int ch = UTF16.charAt(text, start, limit, index - start);
    final int sc = getScript(ch);

    /*
     * Per UTR #24 a boundary must never fall between a non-spacing mark
     * and its base character, so a non-spacing mark continues the run
     * regardless of its own script value.
     */
    final boolean continuesRun = isSameScript(scriptCode, sc)
        || UCharacter.getType(ch) == ECharacterCategory.NON_SPACING_MARK;
    if (!continuesRun) {
      break;
    }

    index += UTF16.getCharCount(ch);

    /*
     * A run that is still Inherited or Common adopts the first script
     * above Inherited that it encounters.
     */
    if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
      scriptCode = sc;
    }
  }

  scriptLimit = index;
  return true;
}

Source File: Trie.java From fitnotifications with Apache License 2.0

5 votes

/**
* Gets the offset to data which the BMP character points to.
* A character in the lead surrogate range is looked up with
* LEAD_INDEX_OFFSET_ as the base; every other character uses base 0.
* @param ch BMP character
* @return offset to data
*/
protected final int getBMPOffset(char ch)
{
    final boolean inLeadRange = ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
                                && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE;
    if (inLeadRange) {
        return getRawOffset(LEAD_INDEX_OFFSET_, ch);
    }
    return getRawOffset(0, ch);
}

com.ibm.icu.text.UTF16 Java Examples