com.ibm.icu.text.UnicodeSet Java Examples

The following examples show how to use com.ibm.icu.text.UnicodeSet. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Compiler.java    From tcl-regex-java with Apache License 2.0 6 votes vote down vote up
/**
 * dovec - fill in arcs for each element of a cvec.
 * All kinds of MCCE complexity removed.
 *
 * <p>Every code point range of {@code set} is handed to the color map via
 * {@code cm.subrange}; a range that consists of a single code point first
 * gets an explicit PLAIN arc for that code point's subcolor.
 *
 * @param set the code points to create arcs for
 * @param lp  state passed to {@code nfa.newarc} / {@code cm.subrange} ahead of {@code rp}
 * @param rp  state passed to {@code nfa.newarc} / {@code cm.subrange} after {@code lp}
 * @throws RegexException declared by this method; presumably raised by the
 *         arc / color map helpers (confirm against their declarations)
 */
private void dovec(UnicodeSet set, State lp, State rp) throws RegexException {
    final int count = set.getRangeCount();
    int index = 0;
    while (index < count) {
        final int first = set.getRangeStart(index);
        final int last = set.getRangeEnd(index);
        // Note: ICU operates in UTF-32 here, and the ColorMap is happy to play along.
        if (LOG.isDebugEnabled() && IS_DEBUG) {
            LOG.debug(String.format("%s %d %4x %4x", set, index, first, last));
        }
        //TODO: this arc is probably redundant.
        if (first == last) {
            nfa.newarc(PLAIN, cm.subcolor(first), lp, rp);
        }
        cm.subrange(first, last, lp, rp);
        index++;
    }
}
 
Example #2
Source File: Normalizer2Impl.java    From trekarta with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Adds whole trie ranges to {@code set} based on their norm16 value.
 *
 * <p>A range is added when its norm16 value is above
 * {@code MIN_NORMAL_MAYBE_YES} and is not {@code JAMO_VT}, or when the value
 * lies in {@code [minNoNoCompNoMaybeCC, limitNoNo)} and the FCD16 value of the
 * range's first code point is greater than 0xff.
 * (Judging by the method name these are the characters with a non-zero lead
 * combining class -- confirm against the data format documentation.)
 *
 * @param set receives the selected ranges
 */
public void addLcccChars(UnicodeSet set) {
    final CodePointMap.Range span = new CodePointMap.Range();
    int cursor = 0;
    while (normTrie.getRange(cursor, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
            null, span)) {
        final int last = span.getEnd();
        final int norm16 = span.getValue();
        final boolean selected;
        if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) {
            selected = true;
        } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
            // Only the first code point of the range is probed.
            selected = getFCD16(cursor) > 0xff;
        } else {
            selected = false;
        }
        if (selected) {
            set.add(cursor, last);
        }
        // Continue with the range that follows this one.
        cursor = last + 1;
    }
}
 
Example #3
Source File: Normalizer2Impl.java    From trekarta with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Reports whether any characters have a decomposition that starts with c
 * and, if so, replaces the contents of the given set with those characters.
 * <b>{@link #ensureCanonIterData()} must have been called before this method,
 * or else this method will crash.</b>
 * @param c A Unicode code point.
 * @param set A UnicodeSet to receive the characters whose decompositions
 *        start with c, if there are any. It is left untouched when the
 *        method returns false.
 * @return true if there are characters whose decomposition starts with c.
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
    // The "not a segment starter" flag is irrelevant here; mask it out.
    final int data = canonIterData.get(c) & ~CANON_NOT_SEGMENT_STARTER;
    if (data == 0) {
        return false;
    }
    set.clear();

    // The value bits are either an index into canonStartSets or a single code point.
    final int payload = data & CANON_VALUE_MASK;
    if ((data & CANON_HAS_SET) != 0) {
        set.addAll(canonStartSets.get(payload));
    } else if (payload != 0) {
        set.add(payload);
    }

    if ((data & CANON_HAS_COMPOSITIONS) == 0) {
        return true;
    }
    final int norm16 = getRawNorm16(c);
    if (norm16 != JAMO_L) {
        addComposites(getCompositionsList(norm16), set);
        return true;
    }
    // Jamo L: add the block of JAMO_VT_COUNT Hangul syllables computed from c.
    final int firstSyllable =
            Hangul.HANGUL_BASE + (c - Hangul.JAMO_L_BASE) * Hangul.JAMO_VT_COUNT;
    set.add(firstSyllable, firstSyllable + Hangul.JAMO_VT_COUNT - 1);
    return true;
}
 
Example #4
Source File: Normalizer2Impl.java    From trekarta with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Adds every composite found in a compositions list to the set, descending
 * into the compositions list of a composite whose low "forward" bit is set.
 *
 * @param list some character's compositions list (index into maybeYesCompositions)
 * @param set recursively receives the composites from these compositions
 */
private void addComposites(int list, UnicodeSet set) {
    boolean lastTuple;
    do {
        final int head = maybeYesCompositions.charAt(list);
        final int packed;
        if ((head & COMP_1_TRIPLE) != 0) {
            // Three-unit entry: the composite is split across units 2 and 3.
            final int upper =
                    (maybeYesCompositions.charAt(list + 1) & ~COMP_2_TRAIL_MASK) << 16;
            packed = upper | maybeYesCompositions.charAt(list + 2);
            list += 3;
        } else {
            // Two-unit entry: the second unit holds the composite and the forward bit.
            packed = maybeYesCompositions.charAt(list + 1);
            list += 2;
        }
        final int composite = packed >> 1;
        if ((packed & 1) != 0) {
            // Forward bit set: recurse into the composite's own list first.
            addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
        }
        set.add(composite);
        lastTuple = (head & COMP_1_LAST_TUPLE) != 0;
    } while (!lastTuple);
}
 
Example #5
Source File: UCaseProps.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the start code point of each same-value range of the trie to the set,
 * stopping at the first range that is flagged as a lead-surrogate range.
 *
 * @param set receives the range start code points
 */
public final void addPropertyStarts(UnicodeSet set) {
    final Iterator<Trie2.Range> ranges = trie.iterator();
    while (ranges.hasNext()) {
        final Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            // Lead-surrogate ranges end the enumeration.
            break;
        }
        set.add(current.startCodePoint);
    }

    /* add code points with hardcoded properties, plus the ones following them */

    /* (none right now, see comment below) */

    /*
     * Omit code points with hardcoded specialcasing properties
     * because we do not build property UnicodeSets for them right now.
     */
}
 
Example #6
Source File: Normalizer2Impl.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the start of a same-norm16-value range to the set and, for a
 * multi-code-point range with algorithmic decompositions, every code point
 * at which the FCD16 value changes.
 *
 * @param start first code point of the range
 * @param end   last code point of the range (inclusive)
 * @param value the norm16 value shared by the range
 * @param set   receives the property start code points
 */
private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
    /* add the start code point to the USet */
    set.add(start);
    if (start == end || !isAlgorithmicNoNo(value)) {
        return;
    }
    // Range of code points with same-norm16-value algorithmic decompositions.
    // They might have different non-zero FCD16 values.
    int lastFCD16 = getFCD16(start);
    for (int c = start + 1; c <= end; ++c) {
        final int fcd16 = getFCD16(c);
        if (fcd16 != lastFCD16) {
            set.add(c);
            lastFCD16 = fcd16;
        }
    }
}
 
Example #7
Source File: ICUNormalizer2CharFilterFactory.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a new ICUNormalizer2CharFilterFactory.
 *
 * <p>Reads the {@code form} (default {@code nfkc_cf}), {@code mode}
 * ({@code compose} or {@code decompose}, default {@code compose}) and optional
 * {@code filter} arguments. A non-empty filter pattern restricts normalization
 * through a {@link FilteredNormalizer2}.
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if {@code args} is not empty after the
 *         known parameters have been read
 */
public ICUNormalizer2CharFilterFactory(Map<String,String> args) {
  super(args);
  final String form = get(args, "form", "nfkc_cf");
  final String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");
  final Normalizer2.Mode normalizerMode;
  if ("compose".equals(mode)) {
    normalizerMode = Normalizer2.Mode.COMPOSE;
  } else {
    normalizerMode = Normalizer2.Mode.DECOMPOSE;
  }
  Normalizer2 chosen = Normalizer2.getInstance(null, form, normalizerMode);

  final String filter = get(args, "filter");
  final UnicodeSet filterSet = (filter == null) ? null : new UnicodeSet(filter);
  if (filterSet != null && !filterSet.isEmpty()) {
    // Freeze the set before handing it to the filtered normalizer.
    filterSet.freeze();
    chosen = new FilteredNormalizer2(chosen, filterSet);
  }

  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
  this.normalizer = chosen;
}
 
Example #8
Source File: ICUFoldingFilterFactory.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a new ICUFoldingFilterFactory.
 *
 * <p>Starts from {@code ICUFoldingFilter.NORMALIZER}; a non-empty
 * {@code filter} pattern wraps it in a {@link FilteredNormalizer2}.
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if {@code args} is not empty after the
 *         known parameters have been read
 */
public ICUFoldingFilterFactory(Map<String,String> args) {
  super(args);

  Normalizer2 chosen = ICUFoldingFilter.NORMALIZER;
  final String filter = get(args, "filter");
  final UnicodeSet filterSet = (filter == null) ? null : new UnicodeSet(filter);
  if (filterSet != null && !filterSet.isEmpty()) {
    // Freeze the set before handing it to the filtered normalizer.
    filterSet.freeze();
    chosen = new FilteredNormalizer2(chosen, filterSet);
  }

  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
  this.normalizer = chosen;
}
 
Example #9
Source File: ICUTransformFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Create a new ICUTransformFilter that transforms text on the given stream.
 *
 * <p>If the transliterator is rule-based and has no filter yet, its non-empty
 * source set is installed as its filter.
 *
 * @param input {@link TokenStream} to filter.
 * @param transform Transliterator to transform the text.
 */
@SuppressWarnings("deprecation")
public ICUTransformFilter(TokenStream input, Transliterator transform) {
  super(input);
  this.transform = transform;

  /*
   * This is cheating, but speeds things up a lot.
   * If we wanted to use pkg-private APIs we could probably do better.
   */
  final boolean unfiltered = transform.getFilter() == null;
  if (unfiltered && transform instanceof com.ibm.icu.text.RuleBasedTransliterator) {
    final UnicodeSet sources = transform.getSourceSet();
    final boolean usable = sources != null && !sources.isEmpty();
    if (usable) {
      transform.setFilter(sources);
    }
  }
}
 
Example #10
Source File: Normalizer2Impl.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Determines whether some characters decompose to a string starting with c;
 * when they do, the set is emptied and refilled with exactly those characters.
 * <b>{@link #ensureCanonIterData()} must have been called before this method,
 * or else this method will crash.</b>
 * @param c A Unicode code point.
 * @param set A UnicodeSet to receive the characters whose decompositions
 *        start with c, if there are any. Not modified when false is returned.
 * @return true if there are characters whose decomposition starts with c.
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
    // Ignore the segment-starter flag; only the remaining bits matter here.
    final int bits = canonIterData.get(c) & ~CANON_NOT_SEGMENT_STARTER;
    if (bits == 0) {
        return false;
    }
    set.clear();

    final int stored = bits & CANON_VALUE_MASK;
    final boolean storedIsSetIndex = (bits & CANON_HAS_SET) != 0;
    if (storedIsSetIndex) {
        set.addAll(canonStartSets.get(stored));
    } else if (stored != 0) {
        // A single code point is stored directly.
        set.add(stored);
    }

    final boolean hasCompositions = (bits & CANON_HAS_COMPOSITIONS) != 0;
    if (hasCompositions) {
        final int norm16 = getNorm16(c);
        if (norm16 != JAMO_L) {
            addComposites(getCompositionsList(norm16), set);
        } else {
            // Jamo L: add the block of JAMO_VT_COUNT Hangul syllables computed from c.
            final int base =
                    Hangul.HANGUL_BASE + (c - Hangul.JAMO_L_BASE) * Hangul.JAMO_VT_COUNT;
            set.add(base, base + Hangul.JAMO_VT_COUNT - 1);
        }
    }
    return true;
}
 
Example #11
Source File: Normalizer2Impl.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Collects all composites listed in a compositions list, following the
 * "forward" bit of an entry into that composite's own compositions list.
 *
 * @param list some character's compositions list (index into maybeYesCompositions)
 * @param set recursively receives the composites from these compositions
 */
private void addComposites(int list, UnicodeSet set) {
    boolean more;
    do {
        final int unit1 = maybeYesCompositions.charAt(list);
        final boolean triple = (unit1 & COMP_1_TRIPLE) != 0;
        final int compositeAndForward;
        if (triple) {
            // Entry spans three units; units 2 and 3 carry the composite.
            final int unit2 = maybeYesCompositions.charAt(list + 1) & ~COMP_2_TRAIL_MASK;
            compositeAndForward = (unit2 << 16) | maybeYesCompositions.charAt(list + 2);
            list += 3;
        } else {
            compositeAndForward = maybeYesCompositions.charAt(list + 1);
            list += 2;
        }
        final int composite = compositeAndForward >> 1;
        final boolean combinesForward = (compositeAndForward & 1) != 0;
        if (combinesForward) {
            addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
        }
        set.add(composite);
        more = (unit1 & COMP_1_LAST_TUPLE) == 0;
    } while (more);
}
 
Example #12
Source File: CollationDataBuilder.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Re-tags the trie entry of every {@code [:Nd:]} code point that has a real
 * mapping (neither FALLBACK nor UNASSIGNED): the existing CE32 is stored via
 * {@code addCE32} and replaced by a DIGIT_TAG CE32 that carries the storage
 * index and the digit value.
 *
 * @throws IndexOutOfBoundsException if the storage index exceeds
 *         {@code Collation.MAX_INDEX}
 */
protected void setDigitTags() {
    final UnicodeSetIterator digitIter = new UnicodeSetIterator(new UnicodeSet("[:Nd:]"));
    while (digitIter.next()) {
        assert(digitIter.codepoint != UnicodeSetIterator.IS_STRING);
        final int c = digitIter.codepoint;
        final int oldCE32 = trie.get(c);
        if (oldCE32 == Collation.FALLBACK_CE32 || oldCE32 == Collation.UNASSIGNED_CE32) {
            continue;  // nothing to tag for this digit
        }
        final int index = addCE32(oldCE32);
        if (index > Collation.MAX_INDEX) {
            // BufferOverflowException is a better fit
            // but cannot be constructed with a message string.
            throw new IndexOutOfBoundsException("too many mappings");
        }
        final int digitCE32 = Collation.makeCE32FromTagIndexAndLength(
                Collation.DIGIT_TAG, index, UCharacter.digit(c));  // u_charDigitValue(c)
        trie.set(c, digitCE32);
    }
}
 
Example #13
Source File: CollationDataBuilder.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Replaces context-sensitive mappings of the set's code points with
 * context-free ones and marks the builder as modified.
 *
 * <p>Iteration stops at the first string element of the set. An empty set
 * leaves the builder untouched (including the modified flag).
 *
 * @param set code points whose contractions are to be suppressed
 */
void suppressContractions(UnicodeSet set) {
    if (set.isEmpty()) {
        return;
    }
    final UnicodeSetIterator iter = new UnicodeSetIterator(set);
    while (iter.next()) {
        final int c = iter.codepoint;
        if (c == UnicodeSetIterator.IS_STRING) {
            break;
        }
        final int ce32 = trie.get(c);
        if (ce32 == Collation.FALLBACK_CE32) {
            final int baseCE32 = base.getFinalCE32(base.getCE32(c));
            if (Collation.ce32HasContext(baseCE32)) {
                trie.set(c, copyFromBaseCE32(c, baseCE32, false /* without context */));
            }
        } else if (isBuilderContextCE32(ce32)) {
            // Simply abandon the list of ConditionalCE32.
            // The caller will copy this builder in the end,
            // eliminating unreachable data.
            trie.set(c, getConditionalCE32ForCE32(ce32).ce32);
            contextChars.remove(c);
        }
    }
    modified = true;
}
 
Example #14
Source File: GenerateUTR30DataFiles.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Expands one {@code UnicodeSet > replacement} rule into plain mapping lines
 * appended to {@code builder}.
 *
 * <p>When the right-hand side matches {@code NUMERIC_VALUE_PATTERN}, one line
 * is written per code point, mapping it to {@code 0x30} plus its numeric value
 * and annotating it with the character name. Otherwise one line is written per
 * range, mapping it to the right-hand side verbatim. A string element in the
 * set is reported on stderr and terminates the JVM with exit status 1.
 *
 * @param builder       receives the generated lines
 * @param leftHandSide  UnicodeSet pattern (parsed with IGNORE_SPACE)
 * @param rightHandSide replacement text
 * @throws IllegalArgumentException declared by this method; presumably from
 *         parsing {@code leftHandSide} (confirm against UnicodeSet)
 */
private static void expandSingleRule
    (StringBuilder builder, String leftHandSide, String rightHandSide)
    throws IllegalArgumentException {
  final UnicodeSet sources = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
  final boolean digitMapping = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
  final UnicodeSetIterator ranges = new UnicodeSetIterator(sources);
  while (ranges.nextRange()) {
    final int first = ranges.codepoint;
    if (first == UnicodeSetIterator.IS_STRING) {
      System.err.println("ERROR: String '" + ranges.getString() + "' found in UnicodeSet");
      System.exit(1);
    } else if (digitMapping) {
      final int last = ranges.codepointEnd;
      for (int cp = first; cp <= last; ++cp) {
        builder.append(String.format(Locale.ROOT, "%04X", cp))
            .append('>')
            .append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp)))
            .append("   # ")
            .append(UCharacter.getName(cp))
            .append("\n");
      }
    } else {
      final int last = ranges.codepointEnd;
      builder.append(String.format(Locale.ROOT, "%04X", first));
      if (last > first) {
        builder.append("..").append(String.format(Locale.ROOT, "%04X", last));
      }
      builder.append('>').append(rightHandSide).append("\n");
    }
  }
}
 
Example #15
Source File: TestICUTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that an unassigned Extended_Pictographic code point is tokenized as
 * {@code <EMOJI>}, both alone and joined to itself with U+200D.
 */
public void testEmojiFromTheFuture() throws Exception {
  // pick an unassigned character with extended_pictographic
  final UnicodeSet futureEmoji = new UnicodeSet("[[:Extended_Pictographic:]&[:Unassigned:]]");
  final String value = new String(Character.toChars(futureEmoji.getRangeStart(0)));
  final String joined = value + '\u200D' + value;
  final String[] emojiType = new String[] { "<EMOJI>" };

  // should analyze to emoji type
  BaseTokenStreamTestCase.assertAnalyzesTo(a, value, new String[] { value }, emojiType);
  // shouldn't break in a sequence
  BaseTokenStreamTestCase.assertAnalyzesTo(a, joined, new String[] { joined }, new String[] { "<EMOJI>" });
}
 
Example #16
Source File: CharacterProperties.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Returns a frozen UnicodeSet for a binary property.
 * Throws an exception if the property number is not one for a binary property.
 *
 * <p>The returned set contains all code points for which the property is true.
 * Sets are built on first request and then served from a cache; access to the
 * cache is synchronized.
 *
 * @param property {@link UProperty#BINARY_START}..{@link UProperty#BINARY_LIMIT}-1
 * @return the property as a set
 * @throws IllegalArgumentException if {@code property} is negative or not
 *         below {@link UProperty#BINARY_LIMIT}
 * @see UProperty
 * @see UCharacter#hasBinaryProperty
 * @stable ICU 63
 */
public static final UnicodeSet getBinaryPropertySet(int property) {
    final boolean isBinaryProperty = 0 <= property && property < UProperty.BINARY_LIMIT;
    if (!isBinaryProperty) {
        throw new IllegalArgumentException("" + property +
                " is not a constant for a UProperty binary property");
    }
    synchronized(sets) {
        UnicodeSet cached = sets[property];
        if (cached != null) {
            return cached;
        }
        cached = makeSet(property);
        sets[property] = cached;
        return cached;
    }
}
 
Example #17
Source File: Locale.java    From tcl-regex-java with Apache License 2.0 5 votes vote down vote up
/**
 * Look up the UnicodeSet for a character class name.
 * It appears that the names that TCL accepts are also acceptable to ICU.
 *
 * @param cclassName class name
 * @param casefold whether to include casefolding; selects which of the two
 *        known-set lookups is consulted
 * @return set
 * @throws RegexException if the lookup failed with a RegexException as its cause;
 *         any other cause is rethrown wrapped in a RegexRuntimeException
 */
public static UnicodeSet cclass(String cclassName, boolean casefold) throws RegexException {
    final UnicodeSet found;
    try {
        if (!casefold) {
            found = KNOWN_SETS_CS.get(cclassName);
        } else {
            found = KNOWN_SETS_CI.get(cclassName);
        }
    } catch (ExecutionException e) {
        final Throwable cause = e.getCause();
        Throwables.propagateIfInstanceOf(cause, RegexException.class);
        throw new RegexRuntimeException(cause);
    }
    return found;
}
 
Example #18
Source File: LocaleData.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Returns the set of exemplar characters for a locale.
 *
 * @param options   Bitmask for options to apply to the exemplar pattern.
 *                  Specify zero to retrieve the exemplar set as it is
 *                  defined in the locale data.  Specify
 *                  UnicodeSet.CASE to retrieve a case-folded exemplar
 *                  set.  See {@link UnicodeSet#applyPattern(String,
 *                  int)} for a complete list of valid options.  The
 *                  IGNORE_SPACE bit is always set, regardless of the
 *                  value of 'options'.
 * @param extype    The type of exemplar set to be retrieved,
 *                  ES_STANDARD, ES_INDEX, ES_AUXILIARY, or ES_PUNCTUATION
 * @return          The set of exemplar characters for the given locale.
 *                  If there is nothing available for the locale,
 *                  then null is returned if {@link #getNoSubstitute()} is true, otherwise the
 *                  root value is returned (which may be UnicodeSet.EMPTY).
 *                  For ES_CURRENCY the result is always null or UnicodeSet.EMPTY.
 * @exception       RuntimeException if the extype is invalid
 *                  (an IllegalArgumentException wrapping the index error).
 * @stable ICU 3.4
 */
public UnicodeSet getExemplarSet(int options, int extype) {
    if (extype == ES_CURRENCY) {
        // currency symbol exemplar is no longer available
        return noSubstitute ? null : UnicodeSet.EMPTY;
    }

    // Resource keys, looked up by using extype as the array index.
    final String[] resourceKeys = {
            "ExemplarCharacters",
            "AuxExemplarCharacters",
            "ExemplarCharactersIndex",
            "ExemplarCharactersCurrency",
            "ExemplarCharactersPunctuation"
    };

    try {
        final String key = resourceKeys[extype]; // will throw an out-of-bounds exception
        final ICUResourceBundle exemplars = (ICUResourceBundle) bundle.get(key);

        // Data that only comes from root counts as "not available" under noSubstitute.
        if (noSubstitute && !bundle.isRoot() && exemplars.isRoot()) {
            return null;
        }
        final int patternOptions = UnicodeSet.IGNORE_SPACE | options;
        return new UnicodeSet(exemplars.getString(), patternOptions);
    } catch (ArrayIndexOutOfBoundsException aiooe) {
        throw new IllegalArgumentException(aiooe);
    } catch (Exception ex) {
        // Best effort: any other failure is treated as "nothing available".
        return noSubstitute ? null : UnicodeSet.EMPTY;
    }
}
 
Example #19
Source File: UnicodeSetStringSpan.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Does the set contain the next code point?
 * If so, return its length; otherwise return its negative length.
 *
 * @param set    the set to test against
 * @param s      the text
 * @param start  index of the first code unit of the code point
 * @param length number of code units available from {@code start}; a
 *               surrogate pair is only read when this is at least 2
 * @return 1 or 2 if the code point is in the set, -1 or -2 if it is not
 */
static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
    final char lead = s.charAt(start);
    if (length >= 2 && Character.isHighSurrogate(lead)) {
        final char trail = s.charAt(start + 1);
        if (com.ibm.icu.text.UTF16.isTrailSurrogate(trail)) {
            final boolean inSet = set.contains(Character.toCodePoint(lead, trail));
            return inSet ? 2 : -2;
        }
    }
    // BMP code point, or an unpaired surrogate treated as a single unit.
    return set.contains(lead) ? 1 : -1;
}
 
Example #20
Source File: UnicodeSetStringSpan.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Does the set contain the code point that ends just before index
 * {@code length}? If so, return its length; otherwise return its negative
 * length.
 *
 * @param set    the set to test against
 * @param s      the text
 * @param length limit index; the code point occupies {@code length - 1} and,
 *               for a surrogate pair, {@code length - 2}
 * @return 1 or 2 if the code point is in the set, -1 or -2 if it is not
 */
static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
    final char trail = s.charAt(length - 1);
    if (length >= 2 && Character.isLowSurrogate(trail)) {
        final char lead = s.charAt(length - 2);
        if (com.ibm.icu.text.UTF16.isLeadSurrogate(lead)) {
            final boolean inSet = set.contains(Character.toCodePoint(lead, trail));
            return inSet ? 2 : -2;
        }
    }
    // BMP code point, or an unpaired surrogate treated as a single unit.
    return set.contains(trail) ? 1 : -1;
}
 
Example #21
Source File: CharacterProperties.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Builds the frozen set of all code points that have the given binary property.
 *
 * <p>Only the code points listed in the property's inclusions set are probed.
 * A run of "true" answers is added as one range when the first "false" answer
 * is seen; a run still open at the end is closed at U+10FFFF.
 *
 * @param property the binary property to build the set for
 * @return the frozen set
 */
private static UnicodeSet makeSet(int property) {
    final UnicodeSet result = new UnicodeSet();
    final UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(property);
    final int rangeCount = inclusions.getRangeCount();
    // Start of the currently open "has property" run, or -1 when none is open.
    int runStart = -1;

    for (int r = 0; r < rangeCount; ++r) {
        final int last = inclusions.getRangeEnd(r);
        int c = inclusions.getRangeStart(r);
        while (c <= last) {
            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
            final boolean has = UCharacter.hasBinaryProperty(c, property);
            if (has && runStart < 0) {
                // Transition from false to true.
                runStart = c;
            } else if (!has && runStart >= 0) {
                // Transition from true to false.
                result.add(runStart, c - 1);
                runStart = -1;
            }
            ++c;
        }
    }
    if (runStart >= 0) {
        result.add(runStart, 0x10FFFF);
    }

    return result.freeze();
}
 
Example #22
Source File: CharacterPropertiesImpl.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Returns a mutable UnicodeSet -- do not modify!
 *
 * <p>Properties in {@code [UProperty.INT_START, UProperty.INT_LIMIT)} get a
 * per-property inclusions set; every other property gets the inclusions set
 * of its data source.
 *
 * @param prop the property
 * @return the inclusions set for {@code prop}
 */
public static synchronized UnicodeSet getInclusionsForProperty(int prop) {
    final boolean isIntProperty = UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT;
    if (isIntProperty) {
        return getIntPropInclusions(prop);
    }
    return getInclusionsForSource(UCharacterProperty.INSTANCE.getSource(prop));
}
 
Example #23
Source File: Normalizer2Impl.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Adds to the set every code point at which a normalization-related value may
 * change: the start of each same-value trie range, FCD16 change points inside
 * certain algorithmic-decomposition ranges, and the Hangul syllable boundaries.
 *
 * @param set receives the property start code points
 */
public void addPropertyStarts(UnicodeSet set) {
    // Add the start code point of each same-value range of the trie.
    final CodePointMap.Range span = new CodePointMap.Range();
    int first = 0;
    while (normTrie.getRange(first, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
            null, span)) {
        final int last = span.getEnd();
        final int norm16 = span.getValue();
        set.add(first);
        final boolean fcd16MayVary = first != last && isAlgorithmicNoNo(norm16) &&
                (norm16 & DELTA_TCCC_MASK) > DELTA_TCCC_1;
        if (fcd16MayVary) {
            // Range of code points with same-norm16-value algorithmic decompositions.
            // They might have different non-zero FCD16 values.
            int lastFCD16 = getFCD16(first);
            for (int c = first + 1; c <= last; ++c) {
                final int fcd16 = getFCD16(c);
                if (fcd16 != lastFCD16) {
                    set.add(c);
                    lastFCD16 = fcd16;
                }
            }
        }
        first = last + 1;
    }

    /* add Hangul LV syllables and LV+1 because of skippables */
    int syllable = Hangul.HANGUL_BASE;
    while (syllable < Hangul.HANGUL_LIMIT) {
        set.add(syllable);
        set.add(syllable + 1);
        syllable += Hangul.JAMO_T_COUNT;
    }
    set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
}
 
Example #24
Source File: UCharacterProperty.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the start code point of each same-value range of the properties
 * vectors trie to the set, stopping at the first lead-surrogate range.
 * Does nothing when there are no additional property columns.
 *
 * @param set receives the range start code points
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    if (m_additionalColumnsCount_ <= 0) {
        /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
        return;
    }
    final Iterator<Trie2.Range> ranges = m_additionalTrie_.iterator();
    while (ranges.hasNext()) {
        final Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
    }
}
 
Example #25
Source File: ContractionsAndExpansions.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Adds one string per code point to the set: the unreversed prefix, followed
 * by the code point, followed by the suffix when there is one.
 *
 * <p>At least one string (for {@code start}) is always added, even when
 * {@code end} is smaller than {@code start}.
 *
 * @param start first code point
 * @param end   last code point (inclusive)
 * @param set   receives the strings; nothing happens when it is null
 */
void addStrings(int start, int end, UnicodeSet set) {
    if (set == null) {
        return;
    }
    final int prefixLength = unreversedPrefix.length();
    final StringBuilder text = new StringBuilder(unreversedPrefix);
    int c = start;
    do {
        text.appendCodePoint(c);
        if (suffix != null) {
            text.append(suffix);
        }
        set.add(text);
        // Cut back to just the prefix for the next code point.
        text.setLength(prefixLength);
        ++c;
    } while (c <= end);
}
 
Example #26
Source File: ContractionsAndExpansions.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Handles one enumerated range of code points that share a CE32, honoring the
 * tailoring mode of the given ContractionsAndExpansions.
 *
 * <ul>
 * <li>{@code checkTailored == 0}: there is no tailoring; the range is handled as is.
 * <li>{@code checkTailored < 0}: tailoring data is being enumerated; fallback
 *     ranges are skipped, all others are recorded in {@code cne.tailored}
 *     and then handled.
 * <li>{@code checkTailored > 0}: base data is being enumerated; tailored code
 *     points are excluded, so only the untailored parts of the range are handled.
 * </ul>
 *
 * @param start first code point of the range
 * @param end   last code point of the range (inclusive)
 * @param ce32  the CE32 shared by the range
 * @param cne   the collector whose state is consulted and updated
 */
private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) {
    if (cne.checkTailored == 0) {
        // There is no tailoring.
        // No need to collect nor check the tailored set.
    } else if (cne.checkTailored < 0) {
        // Collect the set of code points with mappings in the tailoring data.
        if (ce32 == Collation.FALLBACK_CE32) {
            return; // fallback to base, not tailored
        } else {
            cne.tailored.add(start, end);
        }
        // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
    } else if (start == end) {
        if (cne.tailored.contains(start)) {
            return;
        }
    } else if (cne.tailored.containsSome(start, end)) {
        if (cne.ranges == null) {
            cne.ranges = new UnicodeSet();
        }
        cne.ranges.set(start, end).removeAll(cne.tailored);
        int count = cne.ranges.getRangeCount();
        for (int i = 0; i < count; ++i) {
            cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32);
        }
        // The untailored sub-ranges have been handled. Do not fall through to the
        // whole-range call below: it would bring the tailored code points back in
        // and handle the untailored ones a second time.
        return;
    }
    cne.handleCE32(start, end, ce32);
}
 
Example #27
Source File: UCharacterProperty.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Collects, into the given set, the first code point of every same-value
 * range of the properties vectors trie. Enumeration ends at the first range
 * flagged as a lead-surrogate range. Nothing is added when there are no
 * additional property columns.
 *
 * @param set receives the range start code points
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
    final boolean hasVectors = m_additionalColumnsCount_ > 0;
    if (!hasVectors) {
        return;
    }
    /* add the start code point of each same-value range of the properties vectors trie */
    for (Iterator<Trie2.Range> it = m_additionalTrie_.iterator(); it.hasNext(); ) {
        final Trie2.Range next = it.next();
        if (next.leadSurrogate) {
            break;
        }
        set.add(next.startCodePoint);
    }
}
 
Example #28
Source File: Locale.java    From tcl-regex-java with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the UnicodeSet for a character class name by wrapping the name in
 * {@code [:} ... {@code :]} and parsing it as a UnicodeSet pattern.
 *
 * @param cclass the bare class name
 * @return the set for that class
 * @throws RegexException if the pattern is rejected as an illegal argument
 */
public UnicodeSet load(String cclass) throws RegexException {
    final String pattern = "[:" + cclass + ":]";
    final UnicodeSet parsed;
    try {
        parsed = new UnicodeSet(pattern, 0);
    } catch (IllegalArgumentException iae) {
        // NOTE(review): the original IllegalArgumentException is not attached as a cause.
        throw new RegexException("Invalid character class name " + cclass);
    }
    return parsed;
}
 
Example #29
Source File: StringTokenizer.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * {@icu} Constructs a string tokenizer for the specified string. All
 * characters in the delim argument are the delimiters for separating
 * tokens.
 * <p>If the returndelims flag is false, the delimiter characters are
 * skipped and only serve as separators between tokens.
 * <p>If the returndelims flag is true, then the delimiter characters
 * are also returned as tokens.  If coalescedelims is true, one token
 * is returned for each run of delimiter characters, otherwise one
 * token is returned per delimiter.  Since surrogate pairs can be
 * delimiters, the returned token might be two chars in length.
 * @param str a string to be parsed.
 * @param delim the delimiters; null selects the empty delimiter set.
 * @param returndelims flag indicating whether to return the delimiters
 *        as tokens.
 * @param coalescedelims flag indicating whether to return a run of
 *        delimiters as a single token or as one token per delimiter.
 *        This only takes effect if returndelims is true.
 * @exception NullPointerException if str is null
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public StringTokenizer(String str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
{
    m_source_ = str;
    m_length_ = str.length();
    m_delimiters_ = (delim != null) ? delim : EMPTY_DELIMITER_;
    m_returnDelimiters_ = returndelims;
    m_coalesceDelimiters_ = coalescedelims;
    m_tokenOffset_ = -1;
    m_tokenSize_ = -1;

    if (m_length_ == 0) {
        // string length 0, no tokens
        m_nextOffset_ = -1;
        return;
    }
    m_nextOffset_ = 0;
    if (!returndelims) {
        // Delimiters are not tokens: start at the first non-delimiter.
        m_nextOffset_ = getNextNonDelimiter(0);
    }
}
 
Example #30
Source File: CharacterPropertiesImpl.java    From trekarta with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Returns the cached inclusions set for an int-valued property, building and
 * caching it on first use.
 *
 * <p>The set starts out containing code point 0 and additionally receives
 * every code point, among those in the property source's inclusions set, at
 * which the property value differs from the previously probed one.
 *
 * @param prop a property in {@code [UProperty.INT_START, UProperty.INT_LIMIT)}
 * @return the inclusions set for {@code prop}
 */
private static UnicodeSet getIntPropInclusions(int prop) {
    assert(UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT);
    // Int-property slots follow the per-source slots in the cache array.
    final int slot = UCharacterProperty.SRC_COUNT + prop - UProperty.INT_START;
    final UnicodeSet cached = inclusions[slot];
    if (cached != null) {
        return cached;
    }
    final UnicodeSet sourceIncl =
            getInclusionsForSource(UCharacterProperty.INSTANCE.getSource(prop));

    final UnicodeSet starts = new UnicodeSet(0, 0);
    final int rangeCount = sourceIncl.getRangeCount();
    int lastValue = 0;
    for (int r = 0; r < rangeCount; ++r) {
        final int last = sourceIncl.getRangeEnd(r);
        int c = sourceIncl.getRangeStart(r);
        while (c <= last) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            final int value = UCharacter.getIntPropertyValue(c, prop);
            if (value != lastValue) {
                starts.add(c);
                lastValue = value;
            }
            ++c;
        }
    }

    // Compact for caching.
    final UnicodeSet compacted = starts.compact();
    inclusions[slot] = compacted;
    return compacted;
}