Java Code Examples for com.ibm.icu.lang.UCharacter#getType()

The following examples show how to use com.ibm.icu.lang.UCharacter#getType(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: SpoofChecker.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Collects the set of numerics used by a string, as described in UTS 39 section 5.3.
 *
 * @param input  the string to scan, one code point at a time
 * @param result cleared first, then filled with one representative (the zero digit)
 *               for every kind of decimal digit found in {@code input}
 */
private void getNumerics(String input, UnicodeSet result) {
    result.clear();

    int offset = 0;
    while (offset < input.length()) {
        final int cp = Character.codePointAt(input, offset);
        offset += Character.charCount(cp);

        // Only decimal digits contribute a representative.
        if (UCharacter.getType(cp) != UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
            continue;
        }

        // The zero character stands in for its whole digit system;
        // Unicode guarantees it is located at (digit - numeric value).
        final int zeroDigit = cp - UCharacter.getNumericValue(cp);
        result.add(zeroDigit);
    }
}

Example 2

Source File: UCharacterName.java From fitnotifications with Apache License 2.0

6 votes

/**
* Determines the extended type of a code point: the general category reported by
* {@code UCharacter.getType}, refined with extra values for non-characters and for
* lead/trail surrogates.
* @param ch character to be tested
* @return extended type it is associated with
*/
private static int getType(int ch)
{
    // Non-characters get their own value instead of a general category.
    if (UCharacterUtility.isNonCharacter(ch)) {
        return NON_CHARACTER_;
    }

    final int category = UCharacter.getType(ch);
    if (category != UCharacterCategory.SURROGATE) {
        return category;
    }

    // Split the single SURROGATE category into lead and trail.
    return (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) ? LEAD_SURROGATE_ : TRAIL_SURROGATE_;
}

Example 3

Source File: AlphabeticIndex.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Return a list of the first character in each script. Only exposed for testing.
 *
 * @return list of first characters in each script
 * @throws UnsupportedOperationException if the collator provides no
 *         script-first-primary contractions
 * @internal
 * @deprecated This API is ICU internal, only for testing.
 */
@Deprecated
public List<String> getFirstCharactersInScripts() {
    // The script-first-primary contractions are defined in the root collator
    // and all start with U+FDD1.
    UnicodeSet contractions = new UnicodeSet();
    collatorPrimaryOnly.internalAddContractions(0xFDD1, contractions);
    if (contractions.isEmpty()) {
        throw new UnsupportedOperationException(
                "AlphabeticIndex requires script-first-primary contractions");
    }

    List<String> firstCharacters = new ArrayList<String>(200);
    for (String boundary : contractions) {
        // The sample character follows the U+FDD1 prefix.
        int sampleType = UCharacter.getType(boundary.codePointAt(1));
        boolean letterOrUnassigned = ((1 << sampleType) & (GC_L_MASK | GC_CN_MASK)) != 0;
        // Keep only boundaries for "real scripts" (the sample character is a Letter)
        // and the one for unassigned implicit weights (Cn); boundaries for the
        // special reordering groups are ignored.
        if (letterOrUnassigned) {
            firstCharacters.add(boundary);
        }
    }
    return firstCharacters;
}

Example 4

Source File: UCharacterName.java From trekarta with GNU General Public License v3.0

6 votes

/**
* Looks up the extended type of a code point. Non-characters and the two halves of
* the surrogate range are mapped to dedicated values; everything else keeps the
* general category returned by {@code UCharacter.getType}.
* @param ch character to be tested
* @return extended type it is associated with
*/
private static int getType(int ch)
{
    if (UCharacterUtility.isNonCharacter(ch)) {
        // not a character, so a value outside the regular categories is returned
        return NON_CHARACTER_;
    }

    int type = UCharacter.getType(ch);
    if (type != UCharacterCategory.SURROGATE) {
        return type;
    }

    // Surrogates are distinguished by which half of the range they fall in.
    if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
        return LEAD_SURROGATE_;
    }
    return TRAIL_SURROGATE_;
}

Example 5

Source File: ScriptIterator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

5 votes

/**
 * Advances to the next script run.
 *
 * On success, {@code scriptStart} and {@code scriptLimit} delimit the run and
 * {@code scriptCode} holds its script.
 *
 * @return true if there is another script run, false otherwise.
 */
boolean next() {
    if (scriptLimit >= limit) {
        return false;
    }

    scriptStart = scriptLimit;
    scriptCode = UScript.COMMON;

    while (index < limit) {
        final int codePoint = UTF16.charAt(text, start, limit, index - start);
        final int script = getScript(codePoint);

        /*
         * From UTR #24: Implementations that determine the boundaries between
         * characters of given scripts should never break between a non-spacing
         * mark and its base character. Thus for boundary determinations and
         * similar sorts of processing, a non-spacing mark — whatever its script
         * value — should inherit the script value of its base character.
         */
        final boolean continuesRun = isSameScript(scriptCode, script)
                || UCharacter.getType(codePoint) == ECharacterCategory.NON_SPACING_MARK;
        if (!continuesRun) {
            break;
        }

        index += UTF16.getCharCount(codePoint);

        // Inherited or Common becomes the script code of the surrounding text.
        if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
            scriptCode = script;
        }
    }

    scriptLimit = index;
    return true;
}

Example 6

Source File: Character.java From juniversal with MIT License

5 votes

/**
 * Returns the general Unicode category of a code point.
 * <p>
 * Code points strictly between 0 and 1000 are answered from
 * {@code typeValuesCache}; all others are looked up through
 * {@code UCharacter.getType} and adjusted to this class's numbering.
 *
 * @param codePoint
 *            the Unicode code point to get the category of.
 * @return the Unicode category of {@code codePoint}.
 */
public static int getType(int codePoint) {
    if (codePoint > 0 && codePoint < 1000) {
        return typeValuesCache[codePoint];
    }

    final int icuType = UCharacter.getType(codePoint);

    // The type values returned by UCharacter are not compatible with what the
    // spec says: the RI's Character type values skip the value 17, so every
    // value above FORMAT is shifted up by one.
    return (icuType <= Character.FORMAT) ? icuType : icuType + 1;
}

Example 7

Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Tests for "horizontal space": TAB, SPACE, or any character above U+009F
 * whose general category is Zs.
 */
@Override
boolean contains(int c) {
    if (c > 0x9f) {
        /* Zs */
        return UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR;
    }
    /* TAB or SPACE */
    return c == 0x09 || c == 0x20;
}

Example 8

Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
 */
@Override
boolean contains(int c) {
    /*
     * The only cntrl character in graph+blank is TAB (in blank).
     * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
     */
    if (UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR) {
        return true;
    }
    return isgraphPOSIX(c);
}

Example 9

Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Tests for a hexadecimal digit: ASCII or Fullwidth ASCII a-f/A-F, or any
 * character whose general category is decimal digit number.
 */
@Override
boolean contains(int c) {
    /* ASCII A-F and a-f */
    final boolean asciiHexLetter =
        (c >= 0x41 && c <= 0x46) || (c >= 0x61 && c <= 0x66);
    /* Fullwidth ASCII A-F and a-f */
    final boolean fullwidthHexLetter =
        (c >= 0xff21 && c <= 0xff26) || (c >= 0xff41 && c <= 0xff46);

    if (asciiHexLetter || fullwidthHexLetter) {
        return true;
    }
    return UCharacter.getType(c) == UCharacter.DECIMAL_DIGIT_NUMBER;
}

Example 10

Source File: ScriptIterator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Moves on to the following script run, if any.
 * <p>
 * When a run is found, {@code scriptStart}/{@code scriptLimit} bracket it and
 * {@code scriptCode} is the script it resolved to.
 *
 * @return true if there is another script run, false otherwise.
 */
boolean next() {
  if (scriptLimit >= limit) {
    return false;
  }

  scriptCode = UScript.COMMON;
  scriptStart = scriptLimit;

  while (index < limit) {
    final int cp = UTF16.charAt(text, start, limit, index - start);
    final int cpScript = getScript(cp);

    /*
     * From UTR #24: Implementations that determine the boundaries between
     * characters of given scripts should never break between a non-spacing
     * mark and its base character. Thus for boundary determinations and
     * similar sorts of processing, a non-spacing mark — whatever its script
     * value — should inherit the script value of its base character.
     */
    if (!isSameScript(scriptCode, cpScript)
        && UCharacter.getType(cp) != ECharacterCategory.NON_SPACING_MARK) {
      // A different script that is not a combining mark ends the run.
      break;
    }

    index += UTF16.getCharCount(cp);

    // Inherited or Common becomes the script code of the surrounding text.
    if (scriptCode <= UScript.INHERITED && cpScript > UScript.INHERITED) {
      scriptCode = cpScript;
    }
  }

  scriptLimit = index;
  return true;
}

Example 11

Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0

5 votes

/**
 * Reports whether {@code c} is a hex digit: an ASCII or Fullwidth ASCII letter
 * a-f/A-F, or a character of general category decimal digit number.
 */
@Override
boolean contains(int c) {
    /* ASCII a-fA-F */
    final boolean asciiLetter = (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    /* Fullwidth ASCII a-fA-F */
    final boolean fullwidthLetter =
        (c >= '\uff21' && c <= '\uff26') || (c >= '\uff41' && c <= '\uff46');

    return asciiLetter
        || fullwidthLetter
        || UCharacter.getType(c) == UCharacter.DECIMAL_DIGIT_NUMBER;
}

Example 12

Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0

5 votes

/**
 * "Horizontal space" test: up to U+009F only TAB and SPACE qualify; above that,
 * any character of general category Zs.
 */
@Override
boolean contains(int c) {
    return (c <= 0x9f)
            ? (c == 9 || c == 0x20)                                 /* TAB or SPACE */
            : UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR;  /* Zs */
}

Example 13

Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0

5 votes

/**
 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
 * <p>
 * The only cntrl character in graph+blank is TAB (in blank), so
 * (blank-TAB)=Zs is implemented here instead of calling u_isblank().
 */
@Override
boolean contains(int c) {
    final boolean spaceSeparator = UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR;
    return spaceSeparator || isgraphPOSIX(c);
}

Example 14

Source File: UnicodeSet.java From trekarta with GNU General Public License v3.0

4 votes

/**
 * Tests whether the general category of {@code ch} is one of those selected
 * by {@code mask} (one bit per category value).
 */
@Override
public boolean contains(int ch) {
    final int categoryBit = 1 << UCharacter.getType(ch);
    return (categoryBit & mask) != 0;
}

Example 15

Source File: UnicodeData.java From es6draft with MIT License

4 votes

/**
 * Tests whether the general category of {@code codePoint} is selected by
 * {@code value}, interpreted as a bit mask with one bit per category value.
 */
@Override
public boolean has(int codePoint, int value) {
    final int categoryBit = 1 << UCharacter.getType(codePoint);
    return (value & categoryBit) != 0;
}

Example 16

Source File: UTS46.java From trekarta with GNU General Public License v3.0

4 votes

/**
 * Returns a single-bit mask whose bit position is the general category value
 * of {@code c} as reported by {@code UCharacter.getType}.
 */
private static int U_GET_GC_MASK(int c) {
    final int generalCategory = UCharacter.getType(c);
    return 1 << generalCategory;
}

Example 17

Source File: UTS46.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Converts the general category of {@code c} into a one-bit mask
 * ({@code 1 << category}).
 */
private static int U_GET_GC_MASK(int c) {
    final int category = UCharacter.getType(c);
    final int mask = 1 << category;
    return mask;
}

Example 18

Source File: UnicodeSet.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Reports whether the bit for the general category of {@code ch} is set in
 * {@code mask}.
 */
@Override
public boolean contains(int ch) {
    final int type = UCharacter.getType(ch);
    final int typeBit = 1 << type;
    return (typeBit & mask) != 0;
}

Example 19

Source File: BreakTransliterator.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Inserts the {@code insertion} string at break-iterator boundaries inside
 * {@code [pos.start, pos.limit)}, but only at boundaries where both the character
 * before and the character at the boundary have a general category contained in
 * {@code LETTER_OR_MARK_MASK}.
 * <p>
 * The method works in two passes: it first records all qualifying boundaries in the
 * {@code boundaries} array, then performs the insertions from the last boundary to
 * the first, and finally updates {@code pos} to account for the inserted text.
 *
 * @param text        the text to modify in place
 * @param pos         the region to process; {@code contextLimit}, {@code limit} and
 *                    {@code start} are updated before returning
 * @param incremental if true, {@code pos.start} is set just past the last insertion
 *                    point (shifted by the inserted length); otherwise it is set to
 *                    the new {@code pos.limit}
 */
@Override
protected synchronized void handleTransliterate(Replaceable text, Position pos, boolean incremental) {
    // Start with an empty boundary stack; the array itself is reused between calls.
    boundaryCount = 0;
    int boundary = 0;
    getBreakIterator(); // Lazy-create it if necessary
    bi.setText(new ReplaceableCharacterIterator(text, pos.start, pos.limit, pos.start));
    // TODO: fix clumsy workaround used below.
    /*
    char[] tempBuffer = new char[text.length()];
    text.getChars(0, text.length(), tempBuffer, 0);
    bi.setText(new StringCharacterIterator(new String(tempBuffer), pos.start, pos.limit, pos.start));
    */
    // end debugging

    // To make things much easier, we will stack the boundaries, and then insert at the end.
    // generally, we won't need too many, since we will be filtered.

    for(boundary = bi.first(); boundary != BreakIterator.DONE && boundary < pos.limit; boundary = bi.next()) {
        // Offset 0 has no preceding character to inspect.
        if (boundary == 0) continue;
        // HACK: Check to see that preceding item was a letter

        int cp = UTF16.charAt(text, boundary-1);
        int type = UCharacter.getType(cp);
        //System.out.println(Integer.toString(cp,16) + " (before): " + type);
        // Skip unless the character before the boundary is in LETTER_OR_MARK_MASK.
        if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;

        cp = UTF16.charAt(text, boundary);
        type = UCharacter.getType(cp);
        //System.out.println(Integer.toString(cp,16) + " (after): " + type);
        // Skip unless the character at the boundary is in LETTER_OR_MARK_MASK as well.
        if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;

        if (boundaryCount >= boundaries.length) {       // realloc if necessary
            // Grow by doubling, keeping the boundaries collected so far.
            int[] temp = new int[boundaries.length * 2];
            System.arraycopy(boundaries, 0, temp, 0, boundaries.length);
            boundaries = temp;
        }

        boundaries[boundaryCount++] = boundary;
        //System.out.println(boundary);
    }

    // delta: total number of chars inserted; lastBoundary: highest recorded boundary
    // (offset in the original, pre-insertion text).
    int delta = 0;
    int lastBoundary = 0;

    if (boundaryCount != 0) { // if we found something, adjust
        delta = boundaryCount * insertion.length();
        lastBoundary = boundaries[boundaryCount-1];

        // we do this from the end backwards, so that we don't have to keep updating.

        while (boundaryCount > 0) {
            boundary = boundaries[--boundaryCount];
            // Zero-length replace == pure insertion at the boundary.
            text.replace(boundary, boundary, insertion);
        }
    }

    // Now fix up the return values
    pos.contextLimit += delta;
    pos.limit += delta;
    // Note: pos.limit already includes delta at this point.
    pos.start = incremental ? lastBoundary + delta : pos.limit;
}

Example 20

Source File: Characters.java From es6draft with MIT License

2 votes

/**
 * Tests for Unicode general category "Zs" (space separator).
 * 
 * @param c
 *            the character (code point) to test
 * @return {@code true} if the character is a space separator
 */
public static boolean isSpaceSeparator(int c) {
    final int category = UCharacter.getType(c);
    return UCharacterCategory.SPACE_SEPARATOR == category;
}