Java Code Examples for com.ibm.icu.text.UTF16#charAt()

The following examples show how to use com.ibm.icu.text.UTF16#charAt(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: BreakIteratorWrapper.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Classifies the text segment between two break positions by scanning its
 * code points until the first digit or letter is found.
 *
 * @param current offset of the segment start relative to {@code start},
 *                or {@code BreakIterator.DONE}
 * @param next    offset of the segment end relative to {@code start},
 *                or {@code BreakIterator.DONE}
 * @return {@code WORD_NUMBER} if a digit is encountered first,
 *         {@code WORD_LETTER} if a letter is encountered first, and
 *         {@code WORD_NONE} if either position is {@code DONE} or the
 *         segment contains neither
 */
private int calcStatus(int current, int next) {
    if (current == BreakIterator.DONE || next == BreakIterator.DONE) {
        return RuleBasedBreakIterator.WORD_NONE;
    }
    int begin = start + current;
    int end = start + next;
    int codepoint;
    for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
        // Decode at the loop position. Decoding at 'begin' on every pass would
        // re-examine only the first code point of the segment, so a segment
        // such as "-12" would never be recognised as a number.
        codepoint = UTF16.charAt(text, 0, end, i);
        if (UCharacter.isDigit(codepoint)) {
            return RuleBasedBreakIterator.WORD_NUMBER;
        } else if (UCharacter.isLetter(codepoint)) {
            return RuleBasedBreakIterator.WORD_LETTER;
        }
    }
    return RuleBasedBreakIterator.WORD_NONE;
}
 
Example 2
Source File: BreakIteratorWrapper.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether the segment starting at {@code current} begins with an
 * emoji character or emoji sequence.
 *
 * @param current segment start, relative to {@code start}
 * @param next    segment end, relative to {@code start}
 * @return true if the leading code point is in {@code EMOJI} and, when it is
 *         also in {@code EMOJI_RK}, is directly followed inside the segment
 *         by U+FE0F or U+20E3
 */
private boolean isEmoji(int current, int next) {
  final int begin = start + current;
  final int end = start + next;
  final int codepoint = UTF16.charAt(text, 0, end, begin);

  if (!EMOJI.contains(codepoint)) {
    return false;
  }
  if (!EMOJI_RK.contains(codepoint)) {
    return true;
  }

  // Members of EmojiRK only count as emoji when there is evidence of an emoji
  // sequence: an emoji presentation selector or a keycap must follow.
  final int trailer = begin + Character.charCount(codepoint);
  if (trailer >= end) {
    return false;
  }
  final int following = text[trailer];
  return following == 0xFE0F || following == 0x20E3;
}
 
Example 3
Source File: StringTokenizer.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Finds the first position at or after {@code offset} whose code point is a
 * delimiter.
 * When {@code delims} is non-null it is used as a lookup table indexed by
 * code point; otherwise the {@code m_delimiters_} set is consulted.
 * @param offset index into the source string at which to start looking
 * @return index of the next delimiter, or (- source string length - 1) if
 *         {@code offset} is negative or no delimiter is found before the
 *         end of the source
 */
private int getNextDelimiter(int offset)
{
    if (offset < 0) {
        return -1 - m_length_;
    }
    int pos = offset;
    // The character at 'offset' is always examined, even if offset is at
    // the end; the bound is only checked after advancing.
    while (true) {
        final int cp = UTF16.charAt(m_source_, pos);
        final boolean isDelimiter = (delims == null)
            ? m_delimiters_.contains(cp)
            : (cp < delims.length && delims[cp]);
        if (isDelimiter) {
            break;
        }
        pos++;
        if (pos >= m_length_) {
            break;
        }
    }
    return (pos < m_length_) ? pos : -1 - m_length_;
}
 
Example 4
Source File: StringTokenizer.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Finds the first position at or after {@code offset} whose code point is
 * not a delimiter.
 * When {@code delims} is non-null it is used as a lookup table indexed by
 * code point; otherwise the {@code m_delimiters_} set is consulted.
 * @param offset index into the source string at which to start looking
 * @return index of the next non-delimiter, or (- source string length - 1)
 *         if {@code offset} is negative or only delimiters remain before
 *         the end of the source
 */
private int getNextNonDelimiter(int offset)
{
    if (offset < 0) {
        return -1 - m_length_;
    }
    int pos = offset;
    // The character at 'offset' is always examined, even if offset is at
    // the end; the bound is only checked after advancing.
    while (true) {
        final int cp = UTF16.charAt(m_source_, pos);
        final boolean isDelimiter = (delims == null)
            ? m_delimiters_.contains(cp)
            : (cp < delims.length && delims[cp]);
        if (!isDelimiter) {
            break;
        }
        pos++;
        if (pos >= m_length_) {
            break;
        }
    }
    return (pos < m_length_) ? pos : -1 - m_length_;
}
 
Example 5
Source File: ScriptIterator.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Advances to the next script run.
 * On success {@code scriptStart}, {@code scriptLimit} and {@code scriptCode}
 * describe the run that was found.
 *
 * @return true if a run was found, false if the previous run already
 *         reached {@code limit}.
 */
boolean next() {
    if (scriptLimit >= limit) {
        return false;
    }

    scriptStart = scriptLimit;
    scriptCode = UScript.COMMON;

    while (index < limit) {
        final int codePoint = UTF16.charAt(text, start, limit, index - start);
        final int script = getScript(codePoint);

        /*
         * From UTR #24: Implementations that determine the boundaries between
         * characters of given scripts should never break between a non-spacing
         * mark and its base character. Thus for boundary determinations and
         * similar sorts of processing, a non-spacing mark — whatever its script
         * value — should inherit the script value of its base character.
         */
        final boolean extendsRun = isSameScript(scriptCode, script)
                || UCharacter.getType(codePoint) == ECharacterCategory.NON_SPACING_MARK;
        if (!extendsRun) {
            break;
        }

        index += UTF16.getCharCount(codePoint);

        // Inherited or Common takes on the script code of the surrounding text.
        if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
            scriptCode = script;
        }
    }

    scriptLimit = index;
    return true;
}
 
Example 6
Source File: RuleCharacterIterator.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Reads the 32-bit code point at the current position. No escape parsing,
 * variable parsing or whitespace skipping is performed.
 * When {@code buf} is set the code point comes from {@code buf} at
 * {@code bufPos}; otherwise it comes from {@code text} at the index held by
 * {@code pos}.
 * @return the current 32-bit code point, or {@code DONE} when reading from
 *         {@code text} and the index is at or past its end
 */
private int _current() {
    if (buf == null) {
        final int textIndex = pos.getIndex();
        if (textIndex >= text.length()) {
            return DONE;
        }
        return UTF16.charAt(text, textIndex);
    }
    return UTF16.charAt(buf, 0, buf.length, bufPos);
}
 
Example 7
Source File: RuleCharacterIterator.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the 32-bit code point at the current position. No escape parsing,
 * variable parsing or whitespace skipping is performed.
 * When {@code buf} is set the code point comes from {@code buf} at
 * {@code bufPos}; otherwise it comes from {@code text} at the index held by
 * {@code pos}.
 * @return the current 32-bit code point, or {@code DONE} when reading from
 *         {@code text} and the index is at or past its end
 */
private int _current() {
    if (buf == null) {
        final int textIndex = pos.getIndex();
        if (textIndex >= text.length()) {
            return DONE;
        }
        return UTF16.charAt(text, textIndex);
    }
    return UTF16.charAt(buf, 0, buf.length, bufPos);
}
 
Example 8
Source File: ScriptIterator.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Advances to the next script run.
 * On success {@code scriptStart}, {@code scriptLimit} and {@code scriptCode}
 * describe the run that was found.
 *
 * @return true if a run was found, false if the previous run already
 *         reached {@code limit}.
 */
boolean next() {
  if (scriptLimit >= limit) {
    return false;
  }

  scriptStart = scriptLimit;
  scriptCode = UScript.COMMON;

  while (index < limit) {
    final int codePoint = UTF16.charAt(text, start, limit, index - start);
    final int script = getScript(codePoint);

    /*
     * From UTR #24: Implementations that determine the boundaries between
     * characters of given scripts should never break between a non-spacing
     * mark and its base character. Thus for boundary determinations and
     * similar sorts of processing, a non-spacing mark — whatever its script
     * value — should inherit the script value of its base character.
     */
    final boolean extendsRun = isSameScript(scriptCode, script)
        || UCharacter.getType(codePoint) == ECharacterCategory.NON_SPACING_MARK;
    if (!extendsRun) {
      break;
    }

    index += UTF16.getCharCount(codePoint);

    // Inherited or Common takes on the script code of the surrounding text.
    if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
      scriptCode = script;
    }
  }

  scriptLimit = index;
  return true;
}
 
Example 9
Source File: ICUTransformFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@Override
public int char32At(int pos) {
  // Decode the code point at 'pos' within the first 'length' chars of the buffer.
  final int codePoint = UTF16.charAt(buffer, 0, length, pos);
  return codePoint;
}
 
Example 10
Source File: IcuTransformTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 4 votes vote down vote up
@Override
public int char32At(int pos) {
    // Decode the code point at 'pos' within the first 'length' chars of the buffer.
    final int codePoint = UTF16.charAt(buffer, 0, length, pos);
    return codePoint;
}
 
Example 11
Source File: Utility.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Matches a pattern against the given Replaceable. Pattern characters are
 * compared literally and case-sensitively, except for this special
 * character:
 *
 * ~  zero or more Pattern_White_Space chars
 *
 * When the end of the pattern is reached with every element matched, the
 * index of the first unparsed character is returned. Otherwise -1 is
 * returned.
 * @param pat pattern that controls parsing
 * @param text text to be parsed, starting at index
 * @param index offset to first character to parse
 * @param limit offset after last character to parse
 * @return index after last parsed character, or -1 on parse failure.
 */
public static int parsePattern(String pat,
        Replaceable text,
        int index,
        int limit) {
    final int patLength = pat.length();

    // An empty pattern matches immediately.
    if (patLength == 0) {
        return index;
    }

    int patPos = 0;
    int patCp = Character.codePointAt(pat, patPos);

    while (index < limit) {
        final int textCp = text.char32At(index);
        final boolean wildcard = (patCp == '~');

        // A literal pattern character that differs from the text is a failure.
        if (!wildcard && textCp != patCp) {
            return -1;
        }

        if (wildcard) {
            // '~' behaves like \s*: swallow white space and stay on the '~'.
            if (PatternProps.isWhiteSpace(textCp)) {
                index += UTF16.getCharCount(textCp);
                continue;
            }
            // No more white space: step past the '~' and retry the same text
            // character against the next pattern character.
            patPos++;
            if (patPos == patLength) {
                return index; // success; textCp left unparsed
            }
        } else {
            // Literal match: consume it from both the text and the pattern.
            final int width = UTF16.getCharCount(textCp);
            index += width;
            patPos += width;
            if (patPos == patLength) {
                return index; // success; textCp parsed
            }
        }

        patCp = UTF16.charAt(pat, patPos);
    }

    return -1; // text ended before end of pat
}
 
Example 12
Source File: UCaseProps.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Maps the string to single code points and adds the associated case closure
 * mappings.
 * The string is mapped to code points if it is their full case folding string.
 * In other words, this performs a reverse full case folding and then
 * adds the case closure items of the resulting code points.
 * The lookup is a binary search over the rows of the {@code unfold} table;
 * on a hit, every code point stored in the row after the string columns is
 * added to {@code set} and passed to {@code addCaseClosure}.
 *
 * @return true if the string was found
 */
public final boolean addStringCaseClosure(String s, UnicodeSet set) {
    if(unfold==null || s==null) {
        return false; /* no reverse case folding data, or no string */
    }

    final int length=s.length();
    if(length<=1) {
        /*
         * Too short to match anything. A more precise test would count code
         * points instead of chars, but a lone supplementary code point would
         * simply not be found, so it makes little practical difference.
         */
        return false;
    }

    final int rowCount=unfold[UNFOLD_ROWS];
    final int rowWidth=unfold[UNFOLD_ROW_WIDTH];
    final int stringWidth=unfold[UNFOLD_STRING_WIDTH];

    if(length>stringWidth) {
        return false; /* too long to match anything */
    }

    /* binary search over the table rows */
    int lo=0;
    int hi=rowCount;
    while(lo<hi) {
        final int mid=(lo+hi)/2;
        final int rowOffset=((mid+1)*rowWidth); // +1 skips the header row
        final int cmp=strcmpMax(s, rowOffset, stringWidth);

        if(cmp<0) {
            hi=mid;
        } else if(cmp>0) {
            lo=mid+1;
        } else {
            /* found: add each code point of the row, and its case closure */
            int col=stringWidth;
            while(col<rowWidth && unfold[rowOffset+col]!=0) {
                final int c=UTF16.charAt(unfold, rowOffset, unfold.length, col);
                set.add(c);
                addCaseClosure(c, set);
                col+=UTF16.getCharCount(c);
            }
            return true;
        }
    }

    return false; /* string not found */
}
 
Example 13
Source File: UScriptRun.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Advances to the next script run. On success {@code scriptStart},
 * {@code scriptLimit} and {@code scriptCode} describe the run found.
 *
 * @return <code>false</code> if there isn't another run, <code>true</code> if there is.
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public final boolean next()
{
    // Nothing left once the previous run has reached the end of the text.
    if (scriptLimit >= textLimit) {
        return false;
    }

    scriptCode  = UScript.COMMON;
    scriptStart = scriptLimit;

    syncFixup();

    while (textIndex < textLimit) {
        final int codePoint = UTF16.charAt(text, textStart, textLimit, textIndex - textStart);
        final int width = UTF16.getCharCount(codePoint);
        int script = UScript.getScript(codePoint);
        final int pairIndex = getPairIndex(codePoint);

        // Even pair indices are opening characters, odd ones are closing.
        final boolean isPaired = pairIndex >= 0;
        final boolean isClose = isPaired && (pairIndex & 1) != 0;

        textIndex += width;

        // Paired character handling: an opening character is pushed with the
        // current script; a closing character discards unmatched openers above
        // its partner and takes over the partner's script code.
        if (isPaired) {
            if (isClose) {
                final int openIndex = pairIndex & ~1;

                while (stackIsNotEmpty() && top().pairIndex != openIndex) {
                    pop();
                }

                if (stackIsNotEmpty()) {
                    script = top().scriptCode;
                }
            } else {
                push(pairIndex, scriptCode);
            }
        }

        if (!sameScript(scriptCode, script)) {
            // This character starts the next run: step back over it so that
            // it is seen again on the following call.
            textIndex -= width;
            break;
        }

        if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
            scriptCode = script;

            fixup(scriptCode);
        }

        // A closing character consumes its matching opener.
        if (isClose) {
            pop();
        }
    }

    scriptLimit = textIndex;
    return true;
}
 
Example 14
Source File: Utility.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Matches a pattern against the given Replaceable. Pattern characters are
 * compared literally and case-sensitively, except for this special
 * character:
 *
 * ~  zero or more Pattern_White_Space chars
 *
 * When the end of the pattern is reached with every element matched, the
 * index of the first unparsed character is returned. Otherwise -1 is
 * returned.
 * @param pat pattern that controls parsing
 * @param text text to be parsed, starting at index
 * @param index offset to first character to parse
 * @param limit offset after last character to parse
 * @return index after last parsed character, or -1 on parse failure.
 */
public static int parsePattern(String pat,
        Replaceable text,
        int index,
        int limit) {
    final int patLength = pat.length();

    // An empty pattern matches immediately.
    if (patLength == 0) {
        return index;
    }

    int patPos = 0;
    int patCp = Character.codePointAt(pat, patPos);

    while (index < limit) {
        final int textCp = text.char32At(index);
        final boolean wildcard = (patCp == '~');

        // A literal pattern character that differs from the text is a failure.
        if (!wildcard && textCp != patCp) {
            return -1;
        }

        if (wildcard) {
            // '~' behaves like \s*: swallow white space and stay on the '~'.
            if (PatternProps.isWhiteSpace(textCp)) {
                index += UTF16.getCharCount(textCp);
                continue;
            }
            // No more white space: step past the '~' and retry the same text
            // character against the next pattern character.
            patPos++;
            if (patPos == patLength) {
                return index; // success; textCp left unparsed
            }
        } else {
            // Literal match: consume it from both the text and the pattern.
            final int width = UTF16.getCharCount(textCp);
            index += width;
            patPos += width;
            if (patPos == patLength) {
                return index; // success; textCp parsed
            }
        }

        patCp = UTF16.charAt(pat, patPos);
    }

    return -1; // text ended before end of pat
}
 
Example 15
Source File: UCaseProps.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Maps the string to single code points and adds the associated case closure
 * mappings.
 * The string is mapped to code points if it is their full case folding string.
 * In other words, this performs a reverse full case folding and then
 * adds the case closure items of the resulting code points.
 * The lookup is a binary search over the rows of the {@code unfold} table;
 * on a hit, every code point stored in the row after the string columns is
 * added to {@code set} and passed to {@code addCaseClosure}.
 *
 * @return true if the string was found
 */
public final boolean addStringCaseClosure(String s, UnicodeSet set) {
    if(unfold==null || s==null) {
        return false; /* no reverse case folding data, or no string */
    }

    final int length=s.length();
    if(length<=1) {
        /*
         * Too short to match anything. A more precise test would count code
         * points instead of chars, but a lone supplementary code point would
         * simply not be found, so it makes little practical difference.
         */
        return false;
    }

    final int rowCount=unfold[UNFOLD_ROWS];
    final int rowWidth=unfold[UNFOLD_ROW_WIDTH];
    final int stringWidth=unfold[UNFOLD_STRING_WIDTH];

    if(length>stringWidth) {
        return false; /* too long to match anything */
    }

    /* binary search over the table rows */
    int lo=0;
    int hi=rowCount;
    while(lo<hi) {
        final int mid=(lo+hi)/2;
        final int rowOffset=((mid+1)*rowWidth); // +1 skips the header row
        final int cmp=strcmpMax(s, rowOffset, stringWidth);

        if(cmp<0) {
            hi=mid;
        } else if(cmp>0) {
            lo=mid+1;
        } else {
            /* found: add each code point of the row, and its case closure */
            int col=stringWidth;
            while(col<rowWidth && unfold[rowOffset+col]!=0) {
                final int c=UTF16.charAt(unfold, rowOffset, unfold.length, col);
                set.add(c);
                addCaseClosure(c, set);
                col+=UTF16.getCharCount(c);
            }
            return true;
        }
    }

    return false; /* string not found */
}
 
Example 16
Source File: PatternTokenizer.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Quotes a literal string according to the current settings, so that syntax
 * characters, quote characters and ignorable characters end up quoted or
 * escaped. The set of characters that need quoting is built on first use
 * and kept in {@code needingQuoteCharacters}.
 * @param string the literal text to quote.
 * @return the text with every character that needs it backslash-escaped,
 *         single-quoted, or written via {@code appendEscaped}.
 */
public String quoteLiteral(String string) {
    if (needingQuoteCharacters == null) {
        needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
        if (usingSlash) {
            needingQuoteCharacters.add(BACK_SLASH);
        }
        if (usingQuote) {
            needingQuoteCharacters.add(SINGLE_QUOTE);
        }
    }

    StringBuffer result = new StringBuffer();
    int quotedChar = NO_QUOTE;
    int i = 0;
    while (i < string.length()) {
        final int cp = UTF16.charAt(string, i);
        i += UTF16.getCharCount(cp);

        if (escapeCharacters.contains(cp)) {
            // Close any quoted run that is still open before escaping.
            if (quotedChar == IN_QUOTE) {
                result.append(SINGLE_QUOTE);
                quotedChar = NO_QUOTE;
            }
            appendEscaped(result, cp);
        } else if (!needingQuoteCharacters.contains(cp)) {
            // Plain character: close any open quoted run, then copy it.
            if (quotedChar == IN_QUOTE) {
                result.append(SINGLE_QUOTE);
                quotedChar = NO_QUOTE;
            }
            UTF16.append(result, cp);
        } else if (quotedChar == IN_QUOTE) {
            // Already inside a quoted run: copy, doubling a single quote.
            UTF16.append(result, cp);
            if (usingQuote && cp == SINGLE_QUOTE) {
                result.append(SINGLE_QUOTE);
            }
        } else if (usingSlash) {
            result.append(BACK_SLASH);
            UTF16.append(result, cp);
        } else if (usingQuote) {
            if (cp == SINGLE_QUOTE) {
                // A lone single quote is written doubled, without opening a run.
                result.append(SINGLE_QUOTE);
                result.append(SINGLE_QUOTE);
            } else {
                result.append(SINGLE_QUOTE);
                UTF16.append(result, cp);
                quotedChar = IN_QUOTE;
            }
        } else {
            // Neither slash nor quote is available: fall back to \\u or \\U.
            appendEscaped(result, cp);
        }
    }

    // Close a quoted run that is still open at the end of the input.
    if (quotedChar == IN_QUOTE) {
        result.append(SINGLE_QUOTE);
    }
    return result.toString();
}
 
Example 17
Source File: StringTokenizer.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Calculates the number of times that this tokenizer's 
 * <code>nextToken</code> method can be called before it generates an 
 * exception. The current position is not advanced.
 * <p>
 * If token boundaries have already been computed (<code>m_tokenOffset_</code>
 * is non-negative) the remaining count is derived from them. Otherwise the
 * rest of the source is scanned once, the start and limit of every token
 * are recorded in <code>m_tokenStart_</code> / <code>m_tokenLimit_</code>,
 * and <code>m_nextOffset_</code> is put back to the start of the first
 * recorded token.
 * @return the number of tokens remaining in the string using the 
 *         current delimiter set.
 * @see #nextToken()
 * @stable ICU 2.4
 */
public int countTokens() 
{
    int result = 0;
    if (hasMoreTokens()) {
        // Boundaries were computed earlier: answer from the stored values.
        if (m_tokenOffset_ >= 0) {
            return m_tokenSize_ - m_tokenOffset_;
        }
        // Allocate the boundary arrays on first use.
        if (m_tokenStart_ == null) {
            m_tokenStart_ = new int[TOKEN_SIZE_];
            m_tokenLimit_ = new int[TOKEN_SIZE_];
        }
        do {
            // Both arrays are full: grow them by TOKEN_SIZE_ entries,
            // keeping what has been recorded so far.
            if (m_tokenStart_.length == result) {
                int temptokenindex[] = m_tokenStart_;
                int temptokensize[] = m_tokenLimit_;
                int originalsize = temptokenindex.length;
                int newsize = originalsize + TOKEN_SIZE_;
                m_tokenStart_ = new int[newsize];
                m_tokenLimit_ = new int[newsize];
                System.arraycopy(temptokenindex, 0, m_tokenStart_, 0, 
                                 originalsize);
                System.arraycopy(temptokensize, 0, m_tokenLimit_, 0, 
                                 originalsize);
            }
            m_tokenStart_[result] = m_nextOffset_;
            if (m_returnDelimiters_) {
                // Delimiters are tokens too, so classify the code point at
                // the current offset first.
                int c = UTF16.charAt(m_source_, m_nextOffset_);
                boolean contains = delims == null 
                    ? m_delimiters_.contains(c) 
                    : c < delims.length && delims[c];
                if (contains) {
                    if (m_coalesceDelimiters_) {
                        // A run of consecutive delimiters forms one token.
                        m_tokenLimit_[result] = getNextNonDelimiter(
                                                            m_nextOffset_);
                    } else {
                        // The delimiter token is one position wide; a limit
                        // that lands on the end of the source is stored
                        // as -1.
                        int p = m_nextOffset_ + 1;
                        if (p == m_length_) {
                            p = -1;
                        }
                        m_tokenLimit_[result] = p;

                    }
                }
                else {
                    m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
                }
                // The next token starts where this one ends.
                m_nextOffset_ = m_tokenLimit_[result];
            }
            else {
                // Delimiters are not returned: the token ends at the next
                // delimiter and the following one starts at the next
                // non-delimiter after it.
                m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
                m_nextOffset_ = getNextNonDelimiter(m_tokenLimit_[result]);
            }
            result ++;
        } while (m_nextOffset_ >= 0); // a negative offset ends the scan
        // Publish the recorded boundaries and restore the position to the
        // start of the first token, so the tokenizer is not advanced.
        m_tokenOffset_ = 0;
        m_tokenSize_ = result;
        m_nextOffset_ = m_tokenStart_[0];
    }
    return result;
}