Java Code Examples for com.ibm.icu.text.UnicodeSet#add()

The following examples show how to use com.ibm.icu.text.UnicodeSet#add(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Normalizer2Impl.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the property-start code points of one range to the set.
 * The range start is always added. If the range spans more than one code point
 * and {@code isAlgorithmicNoNo(value)} holds, every code point whose
 * {@code getFCD16()} result differs from that of its predecessor is added too.
 *
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param value the value shared by the range, passed to {@code isAlgorithmicNoNo()}
 * @param set receives the start code points
 */
private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
    /* the range start always goes into the USet */
    set.add(start);
    if(start==end || !isAlgorithmicNoNo(value)) {
        return;
    }
    // Range of code points with same-norm16-value algorithmic decompositions.
    // They might have different non-zero FCD16 values,
    // so add each code point where the FCD16 value changes.
    int lastFCD16=getFCD16(start);
    for(int c=start+1; c<=end; ++c) {
        int currentFCD16=getFCD16(c);
        if(currentFCD16!=lastFCD16) {
            set.add(c);
            lastFCD16=currentFCD16;
        }
    }
}
 
Example 2
Source File: Normalizer2Impl.java    From trekarta with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Walks a compositions list and adds every composite found there to the set.
 * When the low bit of an entry's combined composite-and-forward value is set,
 * the composite's own compositions list is walked recursively first.
 *
 * @param list some character's compositions list (an offset into maybeYesCompositions)
 * @param set recursively receives the composites from these compositions
 */
private void addComposites(int list, UnicodeSet set) {
    int offset=list;
    boolean lastTuple;
    do {
        int head=maybeYesCompositions.charAt(offset);
        int packed;
        if((head&COMP_1_TRIPLE)!=0) {
            // three-unit entry: the value is split across the second and third units
            int upper=maybeYesCompositions.charAt(offset+1)&~COMP_2_TRAIL_MASK;
            packed=(upper<<16)|maybeYesCompositions.charAt(offset+2);
            offset+=3;
        } else {
            // two-unit entry: the value is the second unit
            packed=maybeYesCompositions.charAt(offset+1);
            offset+=2;
        }
        int composite=packed>>1;
        boolean hasForwardCompositions=(packed&1)!=0;
        if(hasForwardCompositions) {
            int nestedList=getCompositionsListForComposite(getRawNorm16(composite));
            addComposites(nestedList, set);
        }
        set.add(composite);
        lastTuple=(head&COMP_1_LAST_TUPLE)!=0;
    } while(!lastTuple);
}
 
Example 3
Source File: Normalizer2Impl.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Reports whether there are characters whose decomposition starts with c,
 * and if so replaces the contents of the set with those characters.
 * <b>{@link #ensureCanonIterData()} must have been called before this method,
 * or else this method will crash.</b>
 * The set is left untouched when the method returns false.
 *
 * @param c A Unicode code point.
 * @param set A UnicodeSet to receive the characters whose decompositions
 *        start with c, if there are any.
 * @return true if there are characters whose decomposition starts with c.
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
    final int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
    if(canonValue==0) {
        return false;
    }

    set.clear();
    final int value=canonValue&CANON_VALUE_MASK;
    final boolean valueIsSetIndex=(canonValue&CANON_HAS_SET)!=0;
    if(valueIsSetIndex) {
        // the value selects one of the stored start sets
        set.addAll(canonStartSets.get(value));
    } else if(value!=0) {
        // the value itself is added as a single code point
        set.add(value);
    }

    if((canonValue&CANON_HAS_COMPOSITIONS)==0) {
        return true;
    }
    final int norm16=getNorm16(c);
    if(norm16!=JAMO_L) {
        addComposites(getCompositionsList(norm16), set);
    } else {
        // add the block of JAMO_VT_COUNT syllables that belongs to this Jamo L
        int firstSyllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
        int lastSyllable=firstSyllable+Hangul.JAMO_VT_COUNT-1;
        set.add(firstSyllable, lastSyllable);
    }
    return true;
}
 
Example 4
Source File: Normalizer2Impl.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Adds every composite listed in a compositions list to the set.
 * An entry whose combined composite-and-forward value has its low bit set
 * causes a recursive walk of that composite's own compositions list
 * before the composite itself is added.
 *
 * @param list some character's compositions list (an offset into maybeYesCompositions)
 * @param set recursively receives the composites from these compositions
 */
private void addComposites(int list, UnicodeSet set) {
    int position=list;
    int firstUnit;
    do {
        firstUnit=maybeYesCompositions.charAt(position);
        final boolean isTriple=(firstUnit&COMP_1_TRIPLE)!=0;
        final int compositeAndFwd;
        if(isTriple) {
            // value spread over units 2 and 3 of the entry
            int highBits=(maybeYesCompositions.charAt(position+1)&~COMP_2_TRAIL_MASK)<<16;
            compositeAndFwd=highBits|maybeYesCompositions.charAt(position+2);
            position+=3;
        } else {
            // value held entirely in unit 2 of the entry
            compositeAndFwd=maybeYesCompositions.charAt(position+1);
            position+=2;
        }
        final int composite=compositeAndFwd>>1;
        if((compositeAndFwd&1)!=0) {
            int nestedList=getCompositionsListForComposite(getNorm16(composite));
            addComposites(nestedList, set);
        }
        set.add(composite);
    } while((firstUnit&COMP_1_LAST_TUPLE)==0);
}
 
Example 5
Source File: UCaseProps.java    From trekarta with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Adds to the set the start code point of each same-value range of the trie.
 * Iteration stops at the first range whose leadSurrogate flag is set.
 *
 * @param set receives the range start code points
 */
public final void addPropertyStarts(UnicodeSet set) {
    /* collect the start of every same-value trie range */
    Iterator<Trie2.Range> ranges=trie.iterator();
    while(ranges.hasNext()) {
        Trie2.Range current=ranges.next();
        if(current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
    }

    /*
     * Code points with hardcoded properties (plus the ones following them)
     * would be added here; there are none right now.
     *
     * Code points with hardcoded specialcasing properties are omitted
     * because we do not build property UnicodeSets for them right now.
     */
}
 
Example 6
Source File: UCaseProps.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the start code point of each same-value range of the trie to the set,
 * stopping at the first range that is flagged as leadSurrogate.
 *
 * @param set receives the range start code points
 */
public final void addPropertyStarts(UnicodeSet set) {
    /* one entry per same-value range of the trie */
    Iterator<Trie2.Range> it=trie.iterator();
    boolean done=false;
    while(!done && it.hasNext()) {
        Trie2.Range next=it.next();
        if(next.leadSurrogate) {
            done=true;
        } else {
            set.add(next.startCodePoint);
        }
    }

    /*
     * Nothing else is added: there are currently no code points with
     * hardcoded properties to add, and code points with hardcoded
     * specialcasing properties are omitted because we do not build
     * property UnicodeSets for them right now.
     */
}
 
Example 7
Source File: UCharacterProperty.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Adds the start code point of each same-value range of the properties
 * vectors trie to the set. Does nothing when there are no additional columns.
 *
 * @param set receives the range start code points
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
    if(m_additionalColumnsCount_<=0) {
        return;
    }
    Iterator<Trie2.Range> ranges = m_additionalTrie_.iterator();
    while(ranges.hasNext()) {
        Trie2.Range current = ranges.next();
        if(current.leadSurrogate) {
            /* stop at the first range flagged as leadSurrogate */
            break;
        }
        set.add(current.startCodePoint);
    }
}
 
Example 8
Source File: Locale.java    From tcl-regex-java with Apache License 2.0 5 votes vote down vote up
/**
 * eclass - Because we have no MCCE support, this
 * only processes single characters.
 *
 * @param c the character
 * @param cases if true, the result is {@code allcases(c)};
 *        otherwise it is a new set holding just {@code c}
 * @return a set built for the character
 */
static UnicodeSet eclass(char c, boolean cases) {
    if (!cases) {
        /* no case counterparts wanted: just the character itself */
        UnicodeSet single = new UnicodeSet();
        single.add(c);
        return single;
    }
    return allcases(c);
}
 
Example 9
Source File: Locale.java    From tcl-regex-java with Apache License 2.0 5 votes vote down vote up
/**
 * allcases - supply cvec for all case counterparts of a chr (including itself)
 * This is a shortcut, preferably an efficient one, for simple characters;
 * messy cases are done via range().
 *
 * @param c the code point to start from
 * @return a new set holding {@code c}, closed over with
 *         {@code UnicodeSet.ADD_CASE_MAPPINGS}
 */
static UnicodeSet allcases(int c) {
    return new UnicodeSet()
            .add(c)
            .closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
}
 
Example 10
Source File: CharacterPropertiesImpl.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Returns the inclusions set for an int-valued property, building and
 * caching it in {@code inclusions} on first use.
 * The set is built by scanning every code point of the source's inclusions set
 * and adding each one whose property value differs from the previously
 * scanned code point's value.
 *
 * @param prop a property selector; asserted to be in
 *        [UProperty.INT_START, UProperty.INT_LIMIT)
 * @return the cached (compacted) inclusions set for the property
 */
private static UnicodeSet getIntPropInclusions(int prop) {
    assert(UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT);
    final int slot = UCharacterProperty.SRC_COUNT + prop - UProperty.INT_START;
    if (inclusions[slot] != null) {
        return inclusions[slot];
    }

    final UnicodeSet sourceInclusions =
            getInclusionsForSource(UCharacterProperty.INSTANCE.getSource(prop));

    final UnicodeSet result = new UnicodeSet(0, 0);
    final int rangeCount = sourceInclusions.getRangeCount();
    int lastValue = 0;
    for (int r = 0; r < rangeCount; ++r) {
        final int last = sourceInclusions.getRangeEnd(r);
        int c = sourceInclusions.getRangeStart(r);
        while (c <= last) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            final int current = UCharacter.getIntPropertyValue(c, prop);
            if (current != lastValue) {
                result.add(c);
                lastValue = current;
            }
            ++c;
        }
    }

    // Compact for caching.
    final UnicodeSet compacted = result.compact();
    inclusions[slot] = compacted;
    return compacted;
}
 
Example 11
Source File: UCharacterName.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Converts the char set {@code set} into a Unicode set {@code uset}.
 * Equivalent to charSetToUSet.
 * The receiving set is cleared first; if {@code initNameSetsLengths()} fails
 * it is left empty. Chars 255 down to 1 are tested; char 0 is never tested.
 *
 * @param set Set of 256 bit flags corresponding to a set of chars.
 * @param uset USet to receive characters. Existing contents are deleted.
 */
private void convert(int set[], UnicodeSet uset)
{
    uset.clear();
    if (initNameSetsLengths()) {
        // collect all chars that are used in character names, highest first
        char c = 255;
        while (c > 0) {
            if (contains(set, c)) {
                uset.add(c);
            }
            c --;
        }
    }
}
 
Example 12
Source File: UCharacterProperty.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Adds to the set the start code point of each same-value range of the
 * properties vectors trie, provided there is at least one additional column.
 *
 * @param set receives the range start code points
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
    final boolean hasAdditionalColumns = m_additionalColumnsCount_>0;
    if(!hasAdditionalColumns) {
        return;
    }
    /* walk the ranges up to (not including) the first leadSurrogate range */
    Iterator<Trie2.Range> it = m_additionalTrie_.iterator();
    boolean done = false;
    while(!done && it.hasNext()) {
        Trie2.Range next = it.next();
        if(next.leadSurrogate) {
            done = true;
        } else {
            set.add(next.startCodePoint);
        }
    }
}
 
Example 13
Source File: UCharacterName.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the char set {@code set} into a Unicode set {@code uset}.
 * Equivalent to charSetToUSet.
 * {@code uset} is always cleared; nothing is added when
 * {@code initNameSetsLengths()} returns false. Only chars 1..255 are tested.
 *
 * @param set Set of 256 bit flags corresponding to a set of chars.
 * @param uset USet to receive characters. Existing contents are deleted.
 */
private void convert(int set[], UnicodeSet uset)
{
    uset.clear();
    if (!initNameSetsLengths()) {
        return;
    }

    // add every char that is used in character names, counting down from 255 to 1
    for (int code = 255; code >= 1; --code) {
        char c = (char) code;
        if (!contains(set, c)) {
            continue;
        }
        uset.add(c);
    }
}
 
Example 14
Source File: UCharacterProperty.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Adds the start code point of each same-value range of one of the
 * text layout property tries to the set.
 *
 * @param src selects the trie: SRC_INPC, SRC_INSC or SRC_VO
 * @param set receives the range start code points
 * @return the same set that was passed in
 * @throws IllegalStateException if src is not one of the supported sources
 * @throws MissingResourceException if the selected trie is null (no data loaded)
 */
public UnicodeSet addPropertyStarts(int src, UnicodeSet set) {
    CodePointTrie trie;
    switch (src) {
    case SRC_INPC:
        trie = inpcTrie;
        break;
    case SRC_INSC:
        trie = inscTrie;
        break;
    case SRC_VO:
        trie = voTrie;
        break;
    default:
        // Same exception type as before, but now naming the offending value
        // so the failure can be diagnosed from the stack trace alone.
        throw new IllegalStateException(
                "no text layout property trie for src=" + src);
    }

    if (trie == null) {
        throw new MissingResourceException(
                "no data for one of the text layout properties; src=" + src,
                "LayoutProps", "");
    }

    // Add the start code point of each same-value range of the trie.
    // Each getRange() call fills in the range beginning at start;
    // the next range begins right after its end.
    CodePointMap.Range range = new CodePointMap.Range();
    int start = 0;
    while (trie.getRange(start, null, range)) {
        set.add(start);
        start = range.getEnd() + 1;
    }
    return set;
}
 
Example 15
Source File: Normalizer2Impl.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the start code point of each same-value range of the canonical
 * iterator data trie to the set. Calls {@code ensureCanonIterData()} first.
 *
 * @param set receives the range start code points
 */
public void addCanonIterPropertyStarts(UnicodeSet set) {
    ensureCanonIterData();
    // currently only used for the SEGMENT_STARTER property
    Iterator<Trie2.Range> ranges=canonIterData.iterator(segmentStarterMapper);
    while(ranges.hasNext()) {
        Trie2.Range current=ranges.next();
        if(current.leadSurrogate) {
            /* stop at the first range flagged as leadSurrogate */
            break;
        }
        /* add the start code point to the USet */
        set.add(current.startCodePoint);
    }
}
 
Example 16
Source File: Normalizer2Impl.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Adds property-start code points to the set: the starts derived from each
 * range of normTrie (via {@code enumNorm16PropertyStartsRange()}), and then
 * the hardcoded Hangul syllable boundaries.
 *
 * @param set receives the start code points
 */
public void addPropertyStarts(UnicodeSet set) {
    /* handle each same-value range of the trie, up to the first leadSurrogate range */
    Iterator<Trie2.Range> ranges=normTrie.iterator();
    while(ranges.hasNext()) {
        Trie2.Range r=ranges.next();
        if(r.leadSurrogate) {
            break;
        }
        enumNorm16PropertyStartsRange(r.startCodePoint, r.endCodePoint, r.value, set);
    }

    /* add Hangul LV syllables and LV+1 because of skippables */
    int lv=Hangul.HANGUL_BASE;
    while(lv<Hangul.HANGUL_LIMIT) {
        set.add(lv);
        set.add(lv+1);
        lv+=Hangul.JAMO_T_COUNT;
    }
    /* add Hangul+1 to continue with other properties */
    set.add(Hangul.HANGUL_LIMIT);
}
 
Example 17
Source File: UBiDiProps.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Adds property-start code points to the set from three sources:
 * the same-value ranges of the trie, the bidi mirroring table,
 * and the value changes within the two Joining_Group arrays.
 *
 * @param set receives the start code points
 */
public final void addPropertyStarts(UnicodeSet set) {
    /* add the start code point of each same-value range of the trie */
    Iterator<Trie2.Range> ranges=trie.iterator();
    while(ranges.hasNext()) {
        Trie2.Range current=ranges.next();
        if(current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
    }

    /* add each code point from the bidi mirroring table, and the one after it */
    final int mirrorLength=indexes[IX_MIRROR_LENGTH];
    for(int m=0; m<mirrorLength; ++m) {
        int mirrored=getMirrorCodePoint(mirrors[m]);
        set.add(mirrored, mirrored+1);
    }

    /* add the code points from the Joining_Group array where the value changes */
    int start=indexes[IX_JG_START];
    int limit=indexes[IX_JG_LIMIT];
    byte[] groups=jgArray;
    while(true) {
        final int count=limit-start;
        byte previous=0;
        for(int k=0; k<count; ++k, ++start) {
            byte current=groups[k];
            if(current!=previous) {
                set.add(start);
                previous=current;
            }
        }
        if(previous!=0) {
            /* add the limit code point if the last value was not 0 (it is now start==limit) */
            set.add(limit);
        }
        if(limit!=indexes[IX_JG_LIMIT]) {
            /* the second range has been handled as well */
            break;
        }
        /* switch to the second Joining_Group range */
        start=indexes[IX_JG_START2];
        limit=indexes[IX_JG_LIMIT2];
        groups=jgArray2;
    }

    /* code points with hardcoded properties, plus the ones following them: none right now */
}
 
Example 18
Source File: UBiDiProps.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Collects property-start code points into the set. Three sources are used,
 * in this order: the trie's same-value ranges, the entries of the bidi
 * mirroring table, and the positions where the value changes in each of
 * the two Joining_Group arrays.
 *
 * @param set receives the start code points
 */
public final void addPropertyStarts(UnicodeSet set) {
    /* trie: one start code point per same-value range */
    Iterator<Trie2.Range> it=trie.iterator();
    boolean done=false;
    while(!done && it.hasNext()) {
        Trie2.Range next=it.next();
        if(next.leadSurrogate) {
            done=true;
        } else {
            set.add(next.startCodePoint);
        }
    }

    /* bidi mirroring table: each mirror code point and its successor */
    final int mirrorCount=indexes[IX_MIRROR_LENGTH];
    int entry=0;
    while(entry<mirrorCount) {
        int cp=getMirrorCodePoint(mirrors[entry]);
        set.add(cp, cp+1);
        ++entry;
    }

    /* Joining_Group arrays: each code point where the value changes */
    int start=indexes[IX_JG_START];
    int limit=indexes[IX_JG_LIMIT];
    byte[] values=jgArray;
    boolean more=true;
    while(more) {
        byte last=0;
        final int span=limit-start;
        for(int offset=0; offset<span; ++offset) {
            byte value=values[offset];
            if(value!=last) {
                set.add(start);
                last=value;
            }
            ++start;
        }
        if(last!=0) {
            /* add the limit code point if the last value was not 0 (it is now start==limit) */
            set.add(limit);
        }
        more=(limit==indexes[IX_JG_LIMIT]);
        if(more) {
            /* switch to the second Joining_Group range */
            start=indexes[IX_JG_START2];
            limit=indexes[IX_JG_LIMIT2];
            values=jgArray2;
        }
    }

    /* no code points with hardcoded properties are added right now */
}
 
Example 19
Source File: UCharacterProperty.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Adds property-start code points to the set: first the start of each
 * same-value range of the main trie, then a fixed list of code points with
 * hardcoded properties (plus the ones following them).
 *
 * @param set receives the start code points
 * @return the same set that was passed in, for chaining
 */
public UnicodeSet addPropertyStarts(UnicodeSet set) {
    /* add the start code point of each same-value range of the main trie */
    Iterator<Trie2.Range> ranges = m_trie_.iterator();
    while(ranges.hasNext()) {
        Trie2.Range current = ranges.next();
        if(current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
    }

    /* code points with hardcoded properties, plus the ones following them; added in this order */
    final int[] hardcodedStarts = {
        /* for u_isblank() */
        TAB, TAB+1,

        /* for IS_THAT_CONTROL_SPACE() */
        CR+1, /* range TAB..CR */
        0x1c, 0x1f+1,
        NL, NL+1,

        /* for u_isIDIgnorable() what was not added above */
        DEL, /* range DEL..NBSP-1, NBSP added below */
        HAIRSP, RLM+1,
        INHSWAP, NOMDIG+1,
        ZWNBSP, ZWNBSP+1,

        /* no-break spaces for u_isWhitespace() what was not added above */
        NBSP, NBSP+1,
        FIGURESP, FIGURESP+1,
        NNBSP, NNBSP+1,

        /* for u_charDigitValue() */
        // TODO remove when UCharacter.getHanNumericValue() is changed to just return
        // Unicode numeric values
        0x3007, 0x3008,
        0x4e00, 0x4e01,
        0x4e8c, 0x4e8d,
        0x4e09, 0x4e0a,
        0x56db, 0x56dc,
        0x4e94, 0x4e95,
        0x516d, 0x516e,
        0x4e03, 0x4e04,
        0x516b, 0x516c,
        0x4e5d, 0x4e5e,

        /* for u_digit() */
        U_a, U_z+1,
        U_A, U_Z+1,
        U_FW_a, U_FW_z+1,
        U_FW_A, U_FW_Z+1,

        /* for u_isxdigit() */
        U_f+1, U_F+1,
        U_FW_f+1, U_FW_F+1,

        /* for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
        WJ, /* range WJ..NOMDIG */
        0xfff0, 0xfffb+1,
        0xe0000, 0xe0fff+1,

        /* for UCHAR_GRAPHEME_BASE and others */
        CGJ, CGJ+1,
    };
    for(int codePoint : hardcodedStarts) {
        set.add(codePoint);
    }

    return set; // for chaining
}
 
Example 20
Source File: UCaseProps.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Maps the string to single code points and adds the associated case closure
 * mappings.
 * The string is mapped to code points if it is their full case folding string.
 * In other words, this performs a reverse full case folding and then
 * adds the case closure items of the resulting code points.
 * If the string is found and its closure applied, then
 * the string itself is added as well as part of its code points' closure.
 *
 * @param s the string to look up; null or a string of length 0 or 1 is never found
 * @param set receives the code points and their case closures
 * @return true if the string was found
 */
public final boolean addStringCaseClosure(String s, UnicodeSet set) {
    if(unfold==null || s==null) {
        return false; /* no reverse case folding data, or no string */
    }
    final int length=s.length();
    if(length<=1) {
        /*
         * The string is too short to find any match.
         * More precise would be:
         * if(!u_strHasMoreChar32Than(s, length, 1))
         * but this does not make much practical difference because
         * a single supplementary code point would just not be found.
         */
        return false;
    }

    /* table geometry, read from the header values at the front of unfold */
    final int rowCount=unfold[UNFOLD_ROWS];
    final int rowWidth=unfold[UNFOLD_ROW_WIDTH];
    final int stringWidth=unfold[UNFOLD_STRING_WIDTH];

    if(length>stringWidth) {
        /* the string is too long to find any match */
        return false;
    }

    /* binary search for the string over the table rows */
    int low=0;
    int high=rowCount;
    while(low<high) {
        final int mid=(low+high)/2;
        final int rowOffset=(mid+1)*rowWidth; // +1 to skip the header values above
        final int comparison=strcmpMax(s, rowOffset, stringWidth);

        if(comparison<0) {
            high=mid;
            continue;
        }
        if(comparison>0) {
            low=mid+1;
            continue;
        }

        /*
         * Found the string: add each code point, and its case closure.
         * The code points follow the string part of the row and end at the
         * row end or at the first zero unit, whichever comes first.
         */
        int column=stringWidth;
        while(column<rowWidth && unfold[rowOffset+column]!=0) {
            int c=UTF16.charAt(unfold, rowOffset, unfold.length, column);
            set.add(c);
            addCaseClosure(c, set);
            column+=UTF16.getCharCount(c);
        }
        return true;
    }

    return false; /* string not found */
}