Java Code Examples for com.ibm.icu.lang.UCharacter#getType()

The following examples show how to use com.ibm.icu.lang.UCharacter#getType(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SpoofChecker.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Collects the numerics of a string as described by UTS 39 section 5.3.
 *
 * @param input  the string to scan, one code point at a time
 * @param result cleared on entry; receives one representative (the zero digit)
 *               for every decimal digit found in {@code input}
 */
private void getNumerics(String input, UnicodeSet result) {
    result.clear();

    int offset = 0;
    while (offset < input.length()) {
        final int cp = Character.codePointAt(input, offset);
        offset += Character.charCount(cp);

        // Only decimal digits contribute a representative.
        if (UCharacter.getType(cp) != UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
            continue;
        }

        // The zero of a digit's range stands in for the whole range; Unicode
        // guarantees it sits at (digit - numeric value of digit).
        final int zeroDigit = cp - UCharacter.getNumericValue(cp);
        result.add(zeroDigit);
    }
}
 
Example 2
Source File: UCharacterName.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
* Gets the extended type of a character.
* The result is the value of UCharacter.getType(ch), except that
* non-characters yield NON_CHARACTER_ and surrogates are split into
* LEAD_SURROGATE_ and TRAIL_SURROGATE_.
* @param ch character to be tested
* @return extended type it is associated with
*/
private static int getType(int ch)
{
    // Non-characters get their own pseudo-category.
    if (UCharacterUtility.isNonCharacter(ch)) {
        return NON_CHARACTER_;
    }
    final int category = UCharacter.getType(ch);
    if (category != UCharacterCategory.SURROGATE) {
        return category;
    }
    // Surrogates up to the lead maximum are leads; the rest are trails.
    return (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) ? LEAD_SURROGATE_ : TRAIL_SURROGATE_;
}
 
Example 3
Source File: AlphabeticIndex.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the script-first boundary strings of the root collator, one per script.
 * Only exposed for testing.
 *
 * @return list of first characters in each script
 * @throws UnsupportedOperationException if the collator defines no
 *         script-first-primary contractions
 * @internal
 * @deprecated This API is ICU internal, only for testing.
 */
@Deprecated
public List<String> getFirstCharactersInScripts() {
    // The script-first-primary contractions are defined in the root collator
    // and all start with U+FDD1.
    UnicodeSet contractions = new UnicodeSet();
    collatorPrimaryOnly.internalAddContractions(0xFDD1, contractions);
    if (contractions.isEmpty()) {
        throw new UnsupportedOperationException(
                "AlphabeticIndex requires script-first-primary contractions");
    }

    List<String> firstCharacters = new ArrayList<String>(200);
    final int wantedCategories = GC_L_MASK | GC_CN_MASK;
    for (String boundary : contractions) {
        // The sample character follows the U+FDD1 prefix.
        int sampleCategory = UCharacter.getType(boundary.codePointAt(1));
        // Keep only boundaries for "real scripts" (sample character is a Letter)
        // and the one for unassigned implicit weights (Cn); boundaries for the
        // special reordering groups are dropped.
        if (((1 << sampleCategory) & wantedCategories) != 0) {
            firstCharacters.add(boundary);
        }
    }
    return firstCharacters;
}
 
Example 4
Source File: UCharacterName.java    From trekarta with GNU General Public License v3.0 6 votes vote down vote up
/**
* Gets the extended type of a character: the UCharacter.getType(ch) value,
* with non-characters reported as NON_CHARACTER_ and surrogates reported
* as either LEAD_SURROGATE_ or TRAIL_SURROGATE_.
* @param ch character to be tested
* @return extended type it is associated with
*/
private static int getType(int ch)
{
    if (UCharacterUtility.isNonCharacter(ch)) {
        // not a character: report the dedicated pseudo-category
        return NON_CHARACTER_;
    }

    final int type = UCharacter.getType(ch);
    final boolean isSurrogate = (type == UCharacterCategory.SURROGATE);
    if (!isSurrogate) {
        return type;
    }

    // Distinguish the two surrogate halves by range.
    if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
        return LEAD_SURROGATE_;
    }
    return TRAIL_SURROGATE_;
}
 
Example 5
Source File: ScriptIterator.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Advances to the next script run.
 * On success, scriptStart/scriptLimit delimit the run and scriptCode holds its script.
 *
 * @return true if there is another script run, false otherwise.
 */
boolean next() {
    if (scriptLimit >= limit) {
        return false;
    }

    // A new run starts where the previous one ended, initially as COMMON.
    scriptCode = UScript.COMMON;
    scriptStart = scriptLimit;

    while (index < limit) {
        final int codePoint = UTF16.charAt(text, start, limit, index - start);
        final int script = getScript(codePoint);

        /*
         * From UTR #24: implementations that determine the boundaries between
         * characters of given scripts should never break between a non-spacing
         * mark and its base character. A non-spacing mark therefore inherits
         * the script value of its base character, whatever its own script is.
         */
        final boolean staysInRun = isSameScript(scriptCode, script)
                || UCharacter.getType(codePoint) == ECharacterCategory.NON_SPACING_MARK;
        if (!staysInRun) {
            break;
        }

        index += UTF16.getCharCount(codePoint);

        // Inherited or Common becomes the script code of the surrounding text.
        if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
            scriptCode = script;
        }
    }

    scriptLimit = index;
    return true;
}
 
Example 6
Source File: Character.java    From juniversal with MIT License 5 votes vote down vote up
/**
 * Gets the general Unicode category of the specified code point.
 * Code points strictly between 0 and 1000 are answered from
 * {@code typeValuesCache}; all others are looked up through ICU.
 *
 * @param codePoint
 *            the Unicode code point to get the category of.
 * @return the Unicode category of {@code codePoint}.
 */
public static int getType(int codePoint) {
    if (codePoint > 0 && codePoint < 1000) {
        return typeValuesCache[codePoint];
    }

    final int icuType = UCharacter.getType(codePoint);

    // The type values returned by UCharacter are not compatible with what
    // the spec says: the RI's Character type values skip the value 17, so
    // everything above FORMAT is shifted up by one.
    return (icuType <= Character.FORMAT) ? icuType : icuType + 1;
}
 
Example 7
Source File: UCharacterProperty.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Tests for "horizontal space": TAB or SPACE at or below U+009F,
 * and general category Zs above it.
 */
@Override
boolean contains(int c) {
    if (c > 0x9f) {
        /* Zs */
        return UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR;
    }
    /* SPACE or TAB */
    return c == 0x20 || c == 9;
}
 
Example 8
Source File: UCharacterProperty.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Checks if the code point is in graph+blank minus cntrl
 * (\p{graph}\p{blank} - \p{cntrl}).
 */
@Override
boolean contains(int c) {
    /*
     * The only cntrl character in graph+blank is TAB (in blank), so
     * (blank-TAB)=Zs is tested directly instead of calling u_isblank().
     */
    if (UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR) {
        return true;
    }
    return isgraphPOSIX(c);
}
 
Example 9
Source File: UCharacterProperty.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Tests for a hexadecimal digit: A-F / a-f in ASCII or Fullwidth ASCII,
 * or any code point whose general category is decimal digit number.
 */
@Override
boolean contains(int c) {
    /* ASCII A-F and a-f */
    final boolean asciiHexLetter =
            (c >= 0x41 && c <= 0x46) || (c >= 0x61 && c <= 0x66);
    /* Fullwidth ASCII A-F and a-f */
    final boolean fullwidthHexLetter =
            (c >= 0xff21 && c <= 0xff26) || (c >= 0xff41 && c <= 0xff46);

    if (asciiHexLetter || fullwidthHexLetter) {
        return true;
    }
    return UCharacter.getType(c) == UCharacter.DECIMAL_DIGIT_NUMBER;
}
 
Example 10
Source File: ScriptIterator.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Moves on to the next script run and reports whether there was one.
 * After a successful call, scriptStart and scriptLimit bound the run and
 * scriptCode is its script.
 * 
 * @return true if there is another script run, false otherwise.
 */
boolean next() {
  if (scriptLimit >= limit) {
    return false;
  }

  // Begin the new run at the end of the previous one, defaulting to COMMON.
  scriptStart = scriptLimit;
  scriptCode = UScript.COMMON;

  while (index < limit) {
    final int cp = UTF16.charAt(text, start, limit, index - start);
    final int cpScript = getScript(cp);

    /*
     * From UTR #24: Implementations that determine the boundaries between
     * characters of given scripts should never break between a non-spacing
     * mark and its base character. So a non-spacing mark never ends a run:
     * it inherits the script value of its base character.
     */
    if (!isSameScript(scriptCode, cpScript)
        && UCharacter.getType(cp) != ECharacterCategory.NON_SPACING_MARK) {
      break;
    }

    index += UTF16.getCharCount(cp);

    /*
     * Inherited or Common becomes the script code of the surrounding text.
     */
    if (scriptCode <= UScript.INHERITED && cpScript > UScript.INHERITED) {
      scriptCode = cpScript;
    }
  }

  scriptLimit = index;
  return true;
}
 
Example 11
Source File: UCharacterProperty.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Accepts hexadecimal digits: the letters a-f and A-F in ASCII and in
 * Fullwidth ASCII, plus every code point of category decimal digit number.
 */
@Override
boolean contains(int c) {
    /* check ASCII a-fA-F */
    if ((c >= 0x41 && c <= 0x46) || (c >= 0x61 && c <= 0x66)) {
        return true;
    }
    /* check Fullwidth ASCII a-fA-F */
    if ((c >= 0xff21 && c <= 0xff26) || (c >= 0xff41 && c <= 0xff46)) {
        return true;
    }
    final int category = UCharacter.getType(c);
    return category == UCharacter.DECIMAL_DIGIT_NUMBER;
}
 
Example 12
Source File: UCharacterProperty.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * "Horizontal space" test: for code points up to U+009F only TAB and
 * SPACE qualify; beyond that the general category must be Zs.
 */
@Override
boolean contains(int c) {
    final boolean inLowRange = (c <= 0x9f);
    if (inLowRange) {
        /* TAB or SPACE */
        return c == 9 || c == 0x20;
    }
    /* Zs */
    final int category = UCharacter.getType(c);
    return category == UCharacter.SPACE_SEPARATOR;
}
 
Example 13
Source File: UCharacterProperty.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
 */
@Override
boolean contains(int c) {
    /*
     * TAB (in blank) is the only cntrl character in graph+blank, so
     * (blank-TAB)=Zs is implemented here instead of calling u_isblank().
     */
    final boolean isSpaceSeparator =
            UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR;
    if (isSpaceSeparator) {
        return true;
    }
    return isgraphPOSIX(c);
}
 
Example 14
Source File: UnicodeSet.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Tests whether the general category of {@code ch}, expressed as a single
 * bit (1 shifted left by the category value), is set in {@code mask}.
 */
@Override
public boolean contains(int ch) {
    final int categoryBit = 1 << UCharacter.getType(ch);
    return (categoryBit & mask) != 0;
}
 
Example 15
Source File: UnicodeData.java    From es6draft with MIT License 4 votes vote down vote up
/**
 * Tests whether the general category of {@code codePoint}, expressed as a
 * single bit (1 shifted left by the category value), is set in {@code value}.
 */
@Override
public boolean has(int codePoint, int value) {
    final int categoryBit = 1 << UCharacter.getType(codePoint);
    return (categoryBit & value) != 0;
}
 
Example 16
Source File: UTS46.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/** Returns a mask with only the bit for the general category of {@code c} set. */
private static int U_GET_GC_MASK(int c) {
    final int generalCategory = UCharacter.getType(c);
    return 1 << generalCategory;
}
 
Example 17
Source File: UTS46.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Converts the general category of a code point into a one-bit mask.
 *
 * @param c the code point
 * @return 1 shifted left by the value of UCharacter.getType(c)
 */
private static int U_GET_GC_MASK(int c) {
    final int shift = UCharacter.getType(c);
    return 1 << shift;
}
 
Example 18
Source File: UnicodeSet.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Reports whether {@code mask} has the bit set that corresponds to the
 * general category of {@code ch}.
 */
@Override
public boolean contains(int ch) {
    final int type = UCharacter.getType(ch);
    final int typeBit = 1 << type;
    return (typeBit & mask) != 0;
}
 
Example 19
Source File: BreakTransliterator.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Inserts {@code insertion} at each break-iterator boundary below
 * {@code pos.limit} whose neighbouring code points on both sides have a
 * general category contained in LETTER_OR_MARK_MASK, then adjusts
 * {@code pos} for the inserted text.
 *
 * @param text        the text to modify in place
 * @param pos         the range to process; contextLimit, limit and start are updated
 * @param incremental if true, start is left just after the last insertion point
 *                    instead of at the new limit
 */
@Override
protected synchronized void handleTransliterate(Replaceable text, Position pos, boolean incremental) {
    boundaryCount = 0;
    getBreakIterator(); // Lazy-create it if necessary
    bi.setText(new ReplaceableCharacterIterator(text, pos.start, pos.limit, pos.start));

    // Pass 1: stack up the qualifying boundaries so the insertions can be done
    // afterwards. Few are expected, since the candidates are filtered.
    for (int candidate = bi.first();
            candidate != BreakIterator.DONE && candidate < pos.limit;
            candidate = bi.next()) {
        if (candidate == 0) {
            continue;
        }

        // HACK: require a letter or mark immediately before the boundary...
        final int typeBefore = UCharacter.getType(UTF16.charAt(text, candidate - 1));
        if (((1 << typeBefore) & LETTER_OR_MARK_MASK) == 0) {
            continue;
        }

        // ...and immediately after it.
        final int typeAfter = UCharacter.getType(UTF16.charAt(text, candidate));
        if (((1 << typeAfter) & LETTER_OR_MARK_MASK) == 0) {
            continue;
        }

        // Double the stack when it is full.
        if (boundaryCount >= boundaries.length) {
            int[] grown = new int[boundaries.length * 2];
            System.arraycopy(boundaries, 0, grown, 0, boundaries.length);
            boundaries = grown;
        }
        boundaries[boundaryCount++] = candidate;
    }

    // Pass 2: perform the insertions.
    int delta = 0;
    int lastBoundary = 0;
    final int found = boundaryCount;
    if (found != 0) {
        delta = found * insertion.length();
        lastBoundary = boundaries[found - 1];

        // Work from the end backwards so earlier offsets stay valid.
        while (boundaryCount > 0) {
            final int at = boundaries[--boundaryCount];
            text.replace(at, at, insertion);
        }
    }

    // Now fix up the return values
    pos.contextLimit += delta;
    pos.limit += delta;
    pos.start = incremental ? lastBoundary + delta : pos.limit;
}
 
Example 20
Source File: Characters.java    From es6draft with MIT License 2 votes vote down vote up
/**
 * Tests for Unicode general category "Zs" (space separator).
 * 
 * @param c
 *            the character
 * @return {@code true} if the character is space separator
 */
public static boolean isSpaceSeparator(int c) {
    final int category = UCharacter.getType(c);
    return category == UCharacterCategory.SPACE_SEPARATOR;
}