Java Code Examples for android.icu.lang.UCharacter#getType()

The following examples show how to use android.icu.lang.UCharacter#getType(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AlphabeticIndex.java    From j2objc with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the script-boundary strings of the root collator, one per script.
 * Only exposed for testing.
 *
 * <p>A boundary is kept when the general category of the code point at index 1
 * of the boundary string is a Letter or unassigned (Cn); all other boundaries
 * are dropped.
 *
 * @return list of first characters in each script
 * @throws UnsupportedOperationException if the collator provides no
 *         script-first-primary contractions
 * @deprecated This API is ICU internal, only for testing.
 * @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public List<String> getFirstCharactersInScripts() {
    // The script-first-primary contractions are defined in the root collator
    // and all of them start with U+FDD1.
    UnicodeSet contractions = new UnicodeSet();
    collatorPrimaryOnly.internalAddContractions(0xFDD1, contractions);
    if (contractions.isEmpty()) {
        throw new UnsupportedOperationException(
                "AlphabeticIndex requires script-first-primary contractions");
    }

    List<String> firstCharacters = new ArrayList<String>(200);
    for (String boundary : contractions) {
        int sampleCategoryBit = 1 << UCharacter.getType(boundary.codePointAt(1));
        // Keep only "real scripts" (sample character is a Letter) and the boundary
        // for unassigned implicit weights (Cn); the special reordering groups are skipped.
        boolean isLetterOrUnassigned = (sampleCategoryBit & (GC_L_MASK | GC_CN_MASK)) != 0;
        if (isLetterOrUnassigned) {
            firstCharacters.add(boundary);
        }
    }
    return firstCharacters;
}
 
Example 2
Source File: SpoofChecker.java    From j2objc with Apache License 2.0 6 votes vote down vote up
/**
 * Computes the set of numerics for a string, according to UTS 39 section 5.3.
 *
 * <p>{@code result} is cleared first. For every decimal-digit code point in
 * {@code input}, the zero digit of the same digit system is added to it.
 *
 * @param input  the string to scan, code point by code point
 * @param result the set that receives one representative zero digit per digit system
 */
private void getNumerics(String input, UnicodeSet result) {
    result.clear();

    int offset = 0;
    while (offset < input.length()) {
        final int cp = Character.codePointAt(input, offset);
        offset += Character.charCount(cp);

        if (UCharacter.getType(cp) != UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
            continue;
        }
        // Use the zero character as the representative for comparison.
        // Unicode guarantees it is codePoint - value.
        int zeroDigit = cp - UCharacter.getNumericValue(cp);
        result.add(zeroDigit);
    }
}
 
Example 3
Source File: UCharacterName.java    From j2objc with Apache License 2.0 6 votes vote down vote up
/**
* Gets the extended type of a character: the general category, except that
* noncharacters and lead/trail surrogates get their own dedicated values.
* @param ch character to be tested
* @return extended type it is associated with
*/
private static int getType(int ch)
{
    if (UCharacterUtility.isNonCharacter(ch)) {
        // not a character, so return the dedicated out-of-range category value
        return NON_CHARACTER_;
    }
    int category = UCharacter.getType(ch);
    if (category != UCharacterCategory.SURROGATE) {
        return category;
    }
    // Surrogates are split into lead and trail halves.
    return (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) ? LEAD_SURROGATE_ : TRAIL_SURROGATE_;
}
 
Example 4
Source File: NormalizationMonkeyTest.java    From j2objc with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a random test string made only of assigned code points.
 *
 * <p>The random generator is created lazily on first use. Code points whose
 * general category is UNASSIGNED are re-drawn until an assigned one is found.
 *
 * @return a non-empty string of random assigned code points
 */
String getTestSource() {
    if (random == null) {
        random = createRandom(); // use test framework's random seed
    }
    // Accumulate in a StringBuilder instead of re-allocating a String per code point.
    StringBuilder source = new StringBuilder();
    // NOTE: the length bound is deliberately re-drawn on every iteration, exactly as
    // before, so the sequence of values consumed from the generator is unchanged.
    for (int i = 0; i < (random.nextInt(maxCharCount) + 1); i++) {
        int codepoint = random.nextInt(maxCodePoint);
        // Eliminate unassigned characters
        while (UCharacter.getType(codepoint) == UCharacterCategory.UNASSIGNED) {
            codepoint = random.nextInt(maxCodePoint);
        }
        source.append(UTF16.valueOf(codepoint));
    }
    return source.toString();
}
 
Example 5
Source File: RoundTripTest.java    From j2objc with Apache License 2.0 6 votes vote down vote up
/**
 * Tells whether a string is of the form "aB": a lowercase (or titlecase) letter
 * followed somewhere later by an uppercase or titlecase letter.
 *
 * @param a the string to inspect, walked code point by code point
 * @return true as soon as an uppercase or titlecase letter is found after a
 *         lowercase or titlecase one; false otherwise
 */
public static boolean isCamel(String a) {
    boolean seenLower = false;
    int index = 0;
    while (index < a.length()) {
        final int codePoint = UTF16.charAt(a, index);
        final int category = UCharacter.getType(codePoint);
        if (category == Character.UPPERCASE_LETTER) {
            if (seenLower) {
                return true;
            }
        } else if (category == Character.TITLECASE_LETTER) {
            if (seenLower) {
                return true;
            }
            // a titlecase letter also counts as "lower" for what follows,
            // since its second letter is lowercase
            seenLower = true;
        } else if (category == Character.LOWERCASE_LETTER) {
            seenLower = true;
        }
        index += UTF16.getCharCount(codePoint);
    }
    return false;
}
 
Example 6
Source File: UCharacterProperty.java    From j2objc with Apache License 2.0 5 votes vote down vote up
/**
 * "Horizontal space" test: TAB or SPACE in the range up to U+009F,
 * otherwise any code point whose general category is Zs.
 */
@Override
boolean contains(int c) {
    if (c > 0x9f) {
        /* Zs */
        return UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR;
    }
    /* SPACE or TAB */
    return c == 0x20 || c == 9;
}
 
Example 7
Source File: UCharacterProperty.java    From j2objc with Apache License 2.0 5 votes vote down vote up
/**
 * Checks if the code point is in \p{graph}\p{blank} - \p{cntrl}.
 *
 * The only cntrl character in graph+blank is TAB (in blank), so
 * (blank-TAB)=Zs is tested directly instead of calling u_isblank().
 */
@Override
boolean contains(int c) {
    if (UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR) {
        return true;
    }
    return isgraphPOSIX(c);
}
 
Example 8
Source File: UCharacterProperty.java    From j2objc with Apache License 2.0 5 votes vote down vote up
/**
 * Hex-digit test: ASCII or Fullwidth ASCII a-f / A-F, or any code point
 * whose general category is decimal digit number.
 */
@Override
boolean contains(int c) {
    /* ASCII A-F (0x41..0x46) and a-f (0x61..0x66) */
    boolean asciiHexLetter = c >= 0x41 && c <= 0x66 && (c <= 0x46 || c >= 0x61);
    /* Fullwidth A-F (0xff21..0xff26) and a-f (0xff41..0xff46) */
    boolean fullwidthHexLetter = c >= 0xff21 && c <= 0xff46 && (c <= 0xff26 || c >= 0xff41);
    if (asciiHexLetter || fullwidthHexLetter) {
        return true;
    }
    return UCharacter.getType(c) == UCharacter.DECIMAL_DIGIT_NUMBER;
}
 
Example 9
Source File: TestCanonicalIterator.java    From j2objc with Apache License 2.0 5 votes vote down vote up
/**
 * Runs characterTest over every code point below U+10FFFF that is not
 * unassigned, private use or a surrogate, both alone and followed by U+0345.
 */
@Test
public void TestExhaustive() {
    CanonicalIterator it = new CanonicalIterator("");
    int tested = 0;

    for (int cp = 0; cp < 0x10FFFF; ++cp) {
        // skip characters we know don't have decomps
        int category = UCharacter.getType(cp);
        boolean hasNoDecomposition = category == Character.UNASSIGNED
                || category == Character.PRIVATE_USE
                || category == Character.SURROGATE;
        if (hasNoDecomposition) {
            continue;
        }

        // progress message every 5000 tested code points
        tested++;
        if (tested % 5000 == 0) {
            logln("Testing " + Utility.hex(cp, 0));
        }

        String single = UTF16.valueOf(cp);
        characterTest(single, cp, it);
        characterTest(single + "\u0345", cp, it);
    }
}
 
Example 10
Source File: UnicodeSetTest.java    From j2objc with Apache License 2.0 5 votes vote down vote up
/**
 * Checks the [:Lu:] and [:L:] category sets against UCharacter for the
 * code points below 0x200.
 */
@Test
public void TestCategories() {
    int failures = 0;
    UnicodeSet upperFirst = new UnicodeSet("[:Lu:]");
    expectContainment(upperFirst, "ABC", "abc");

    // Make sure generation of L doesn't pollute cached Lu set:
    // first generate L, then Lu.
    // There is no need to go over the whole range; 0x200 code points are enough.
    UnicodeSet letters = new UnicodeSet("[:L:]");
    for (int cp = 0; cp < 0x200; ++cp) {
        char ch = (char) cp;
        boolean inSet = letters.contains(ch);
        if (UCharacter.isLetter(cp) != inSet) {
            errln("FAIL: L contains " + ch + " = " + inSet);
            // stop after the tenth reported failure
            if (++failures == 10) {
                break;
            }
        }
    }

    UnicodeSet upperAgain = new UnicodeSet("[:Lu:]");
    for (int cp = 0; cp < 0x200; ++cp) {
        char ch = (char) cp;
        boolean inSet = upperAgain.contains(ch);
        boolean isUpper = UCharacter.getType(cp) == ECharacterCategory.UPPERCASE_LETTER;
        if (isUpper != inSet) {
            errln("FAIL: Lu contains " + ch + " = " + inSet);
            // the failure counter is shared with the loop above
            if (++failures == 20) {
                break;
            }
        }
    }
}
 
Example 11
Source File: UCharacterTest.java    From j2objc with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that UCharacter.getType returns 0 for values just above the
 * maximum code point.
 */
@Test
public void TestGetProperty(){
    int[] cases = {UTF16.CODEPOINT_MAX_VALUE+1, UTF16.CODEPOINT_MAX_VALUE+2};
    for (int outOfRange : cases) {
        if (UCharacter.getType(outOfRange) != 0) {
            errln("UCharacter.getType for testing UCharacter.getProperty "
                    + "did not return 0 for passed value of " + outOfRange +
                    " but got " + UCharacter.getType(outOfRange));
        }
    }
}
 
Example 12
Source File: BreakTransliterator.java    From j2objc with Apache License 2.0 4 votes vote down vote up
/**
 * Inserts the {@code insertion} string at break-iterator boundaries inside
 * {@code [pos.start, pos.limit)} of {@code text}.
 *
 * <p>A boundary is used only when it is not at offset 0 and the general
 * categories of both the code point before it and the code point at it are
 * in {@code LETTER_OR_MARK_MASK}. Qualifying boundaries are first collected
 * in the {@code boundaries} array and then applied from last to first.
 *
 * @param text        the text to modify in place
 * @param pos         the range to process; {@code contextLimit}, {@code limit}
 *                    and {@code start} are updated to account for the inserted text
 * @param incremental if true, {@code pos.start} is left just after the last
 *                    insertion point; otherwise it is set to the new {@code pos.limit}
 */
@Override
protected synchronized void handleTransliterate(Replaceable text, Position pos, boolean incremental) {
    // Start each call with an empty boundary stack.
    boundaryCount = 0;
    int boundary = 0;
    getBreakIterator(); // Lazy-create it if necessary
    bi.setText(new ReplaceableCharacterIterator(text, pos.start, pos.limit, pos.start));
    // TODO: fix clumsy workaround used below.
    /*
    char[] tempBuffer = new char[text.length()];
    text.getChars(0, text.length(), tempBuffer, 0);
    bi.setText(new StringCharacterIterator(new String(tempBuffer), pos.start, pos.limit, pos.start));
    */
    // end debugging

    // To make things much easier, we will stack the boundaries, and then insert at the end.
    // Generally, we won't need too many, since we will be filtered.

    for(boundary = bi.first(); boundary != BreakIterator.DONE && boundary < pos.limit; boundary = bi.next()) {
        // Offset 0 has no preceding character to inspect.
        if (boundary == 0) continue;
        // HACK: Check to see that the preceding item was a letter (or mark, per the mask)

        int cp = UTF16.charAt(text, boundary-1);
        int type = UCharacter.getType(cp);
        //System.out.println(Integer.toString(cp,16) + " (before): " + type);
        if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;

        // Same check for the code point that starts at the boundary.
        cp = UTF16.charAt(text, boundary);
        type = UCharacter.getType(cp);
        //System.out.println(Integer.toString(cp,16) + " (after): " + type);
        if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;

        if (boundaryCount >= boundaries.length) {       // realloc if necessary: double the capacity
            int[] temp = new int[boundaries.length * 2];
            System.arraycopy(boundaries, 0, temp, 0, boundaries.length);
            boundaries = temp;
        }

        boundaries[boundaryCount++] = boundary;
        //System.out.println(boundary);
    }

    // delta: total number of chars inserted; lastBoundary: highest recorded boundary (pre-insertion offset)
    int delta = 0;
    int lastBoundary = 0;

    if (boundaryCount != 0) { // if we found something, adjust
        delta = boundaryCount * insertion.length();
        lastBoundary = boundaries[boundaryCount-1];

        // we do this from the end backwards, so that we don't have to keep updating
        // the offsets of the boundaries that are still to be processed.

        while (boundaryCount > 0) {
            boundary = boundaries[--boundaryCount];
            text.replace(boundary, boundary, insertion);
        }
    }

    // Now fix up the return values
    pos.contextLimit += delta;
    pos.limit += delta;
    // pos.limit has already been shifted by delta at this point.
    pos.start = incremental ? lastBoundary + delta : pos.limit;
}
 
Example 13
Source File: UnicodeSet.java    From j2objc with Apache License 2.0 4 votes vote down vote up
/**
 * Tests whether the general category of a code point is selected by {@code mask}.
 *
 * @param ch the code point to test
 * @return true if the bit for the code point's general category is set in {@code mask}
 */
@Override
public boolean contains(int ch) {
    int categoryBit = 1 << UCharacter.getType(ch);
    return (categoryBit & mask) != 0;
}
 
Example 14
Source File: UTS46.java    From j2objc with Apache License 2.0 4 votes vote down vote up
/**
 * Returns a single-bit mask for the general category of a code point.
 *
 * @param c the code point
 * @return 1 shifted left by the value of {@code UCharacter.getType(c)}
 */
private static int U_GET_GC_MASK(int c) {
    final int generalCategory = UCharacter.getType(c);
    return 1 << generalCategory;
}
 
Example 15
Source File: BasicTest.java    From j2objc with Apache License 2.0 4 votes vote down vote up
/**
 * Counts the code points for which case folding may turn an FCD string into
 * a non-FCD one, reporting each of them through {@code errln}.
 *
 * <p>Unassigned code points are skipped, as are the Hangul and Han ranges,
 * which have no case folding.
 *
 * @param foldingOptions the case-folding options under test; 0 selects the
 *                       default mappings when calling {@code UCharacter.foldCase}
 * @return the number of offending code points found
 */
int countFoldFCDExceptions(int foldingOptions) {
    String s, d;
    int c;
    int count;
    int cc, trailCC, foldCC, foldTrailCC;
    Normalizer.QuickCheckResult qcResult;
    int category;
    boolean isNFD;


    logln("Test if case folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");

    count=0;
    for(c=0; c<=0x10ffff; ++c) {
        category=UCharacter.getType(c);
        if(category==UCharacterCategory.UNASSIGNED) {
            continue; // skip unassigned code points
        }
        if(c==0xac00) {
            c=0xd7a3; // skip Hangul - no case folding there
            continue;
        }
        // skip Han blocks - no case folding there either
        if(c==0x3400) {
            c=0x4db5;
            continue;
        }
        if(c==0x4e00) {
            c=0x9fa5;
            continue;
        }
        if(c==0x20000) {
            c=0x2a6d6;
            continue;
        }

        s= UTF16.valueOf(c);

        // get leading and trailing cc for c
        d= Normalizer.decompose(s,false);
        // compare contents, not references: whether decompose() hands back the
        // same String instance for already-normalized input is an implementation detail
        isNFD= s.equals(d);
        cc=UCharacter.getCombiningClass(UTF16.charAt(d,0));
        trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));

        // get leading and trailing cc for the case-folding of c;
        // Strings are immutable, so the folded result must be assigned back to s
        s = UCharacter.foldCase(s,(foldingOptions==0));

        // bad: character maps to empty string: adjacent characters may then need reordering.
        // Reported here, before any indexing into the (then empty) decomposition.
        if(s.length()==0) {
            ++count;
            errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
            continue;
        }

        d = Normalizer.decompose(s, false);
        foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0));
        foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));

        qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0);


        // bad:
        // - folding has different leading/trailing cc's, and they don't become just 0
        // - folding itself is not FCD
        if( qcResult!=Normalizer.YES ||
            (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0)
        ) {
            ++count;
            errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
            continue;
        }

        // also bad:
        // if a code point is in NFD but its case folding is not, then
        // unorm_compare will also fail
        if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) {
            ++count;
            errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
        }
    }

    logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")" );
    return count;
}