com.ibm.icu.text.UnicodeSetIterator Java Examples

The following examples show how to use com.ibm.icu.text.UnicodeSetIterator. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CollationDataBuilder.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Replaces context-carrying mappings for the code points of {@code set}
 * with mappings that have no context.
 *
 * <p>An empty set is a no-op (the builder is not marked as modified).
 * Otherwise the set's code points are visited in iterator order, and the
 * walk ends as soon as the iterator reports a string element.
 *
 * @param set the code points whose contractions are to be suppressed
 */
void suppressContractions(UnicodeSet set) {
    if (set.isEmpty()) {
        return;
    }
    for (UnicodeSetIterator it = new UnicodeSetIterator(set);
            it.next() && it.codepoint != UnicodeSetIterator.IS_STRING; ) {
        final int cp = it.codepoint;
        final int current = trie.get(cp);
        if (current == Collation.FALLBACK_CE32) {
            // Nothing stored in this builder yet: consult the base data and
            // copy its mapping only if that mapping carries context.
            final int baseCE32 = base.getFinalCE32(base.getCE32(cp));
            if (Collation.ce32HasContext(baseCE32)) {
                trie.set(cp, copyFromBaseCE32(cp, baseCE32, false /* without context */));
            }
        } else if (isBuilderContextCE32(current)) {
            // Simply abandon the list of ConditionalCE32.
            // The caller will copy this builder in the end,
            // eliminating unreachable data.
            trie.set(cp, getConditionalCE32ForCE32(current).ce32);
            contextChars.remove(cp);
        }
    }
    modified = true;
}
 
Example #2
Source File: CollationDataBuilder.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Re-tags the trie entries of all {@code [:Nd:]} code points with
 * {@code Collation.DIGIT_TAG}.
 *
 * <p>Entries that are {@code FALLBACK_CE32} or {@code UNASSIGNED_CE32} are
 * left alone. For every other entry the current CE32 is stored via
 * {@code addCE32} and replaced by a digit-tag CE32 that carries the
 * resulting index and the code point's digit value.
 *
 * @throws IndexOutOfBoundsException if {@code addCE32} returns an index
 *         greater than {@code Collation.MAX_INDEX}
 */
protected void setDigitTags() {
    UnicodeSetIterator digitIter = new UnicodeSetIterator(new UnicodeSet("[:Nd:]"));
    while (digitIter.next()) {
        assert digitIter.codepoint != UnicodeSetIterator.IS_STRING;
        final int digit = digitIter.codepoint;
        final int oldCE32 = trie.get(digit);
        if (oldCE32 == Collation.FALLBACK_CE32 || oldCE32 == Collation.UNASSIGNED_CE32) {
            continue;  // no mapping of its own to wrap
        }
        final int index = addCE32(oldCE32);
        if (index > Collation.MAX_INDEX) {
            // BufferOverflowException is a better fit
            // but cannot be constructed with a message string.
            throw new IndexOutOfBoundsException("too many mappings");
        }
        final int digitCE32 = Collation.makeCE32FromTagIndexAndLength(
                Collation.DIGIT_TAG, index, UCharacter.digit(digit));  // u_charDigitValue(c)
        trie.set(digit, digitCE32);
    }
}
 
Example #3
Source File: CollationDataBuilder.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Rebuilds the context data for every code point in {@code contextChars}.
 *
 * <p>Abandoned lists and the cached builtCE32 are ignored: {@code contexts}
 * is emptied first and every context is built from scratch, with the
 * result of {@code buildContext} written back into the trie.
 *
 * @throws AssertionError if a code point in {@code contextChars} has a
 *         trie value that is not a builder context CE32
 */
protected void buildContexts() {
    contexts.setLength(0);
    for (UnicodeSetIterator it = new UnicodeSetIterator(contextChars); it.next(); ) {
        assert it.codepoint != UnicodeSetIterator.IS_STRING;
        final int cp = it.codepoint;
        final int contextCE32 = trie.get(cp);
        if (!isBuilderContextCE32(contextCE32)) {
            throw new AssertionError("Impossible: No context data for c in contextChars.");
        }
        trie.set(cp, buildContext(getConditionalCE32ForCE32(contextCE32)));
    }
}
 
Example #4
Source File: CollationBuilder.java    From fitnotifications with Apache License 2.0 6 votes vote down vote up
/**
 * Visits every code point in {@code COMPOSITES}, fetches the CEs of its
 * NFD decomposition into {@code ces}/{@code cesLength}, and passes them to
 * {@code addIfDifferent} together with the composite itself and an empty
 * prefix.
 */
private void closeOverComposites() {
    final String emptyPrefix = "";
    for (UnicodeSetIterator it = new UnicodeSetIterator(COMPOSITES); it.next(); ) {
        assert it.codepoint != UnicodeSetIterator.IS_STRING;
        cesLength = dataBuilder.getCEs(nfd.getDecomposition(it.codepoint), ces, 0);
        // Too many CEs from the decomposition (unusual) means this composite is ignored.
        // We could add a capacity parameter to getCEs() and reallocate if necessary.
        // However, this can only really happen in contrived cases.
        if (cesLength <= Collation.MAX_EXPANSION_LENGTH) {
            addIfDifferent(emptyPrefix, it.getString(), ces, cesLength, Collation.UNASSIGNED_CE32);
        }
    }
}
 
Example #5
Source File: UTR30DataFileGenerator.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Expands one rule into mapping lines and appends them to {@code builder}.
 *
 * <p>{@code leftHandSide} is parsed as a {@code UnicodeSet} pattern (spaces
 * ignored) and walked range by range:
 * <ul>
 *   <li>If {@code rightHandSide} matches {@code NUMERIC_VALUE_PATTERN}, each
 *       code point gets its own line {@code XXXX>YYYY   # name}, where
 *       {@code YYYY} is {@code 0x30} plus the code point's numeric value.</li>
 *   <li>Otherwise each range yields one line {@code XXXX[..YYYY]>rightHandSide}.</li>
 *   <li>String elements of the set are reported through {@code logger} and skipped.</li>
 * </ul>
 *
 * @param builder       receives the generated lines
 * @param leftHandSide  UnicodeSet pattern naming the source code points
 * @param rightHandSide mapping target, or a marker matched by {@code NUMERIC_VALUE_PATTERN}
 */
private static void expandSingleRule(StringBuilder builder, String leftHandSide, String rightHandSide) {
    final UnicodeSet sources = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
    final boolean numericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
    final UnicodeSetIterator ranges = new UnicodeSetIterator(sources);
    while (ranges.nextRange()) {
        if (ranges.codepoint == UnicodeSetIterator.IS_STRING) {
            logger.error("ERROR: String '" + ranges.getString() + "' found in UnicodeSet");
            continue;
        }
        if (!numericValue) {
            // One line per range: start, optional "..end", then the shared target.
            builder.append(String.format(Locale.ROOT, "%04X", ranges.codepoint));
            if (ranges.codepointEnd > ranges.codepoint) {
                builder.append("..").append(String.format(Locale.ROOT, "%04X", ranges.codepointEnd));
            }
            builder.append('>').append(rightHandSide).append("\n");
            continue;
        }
        // Numeric rule: every code point in the range maps to its own target.
        for (int cp = ranges.codepoint; cp <= ranges.codepointEnd; ++cp) {
            builder.append(String.format(Locale.ROOT, "%04X", cp))
                    .append('>')
                    .append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp)))
                    .append("   # ")
                    .append(UCharacter.getName(cp))
                    .append("\n");
        }
    }
}
 
Example #6
Source File: CollationDataBuilder.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Copies base mappings into this builder for the code points of
 * {@code set} that currently map to {@code Collation.FALLBACK_CE32}.
 *
 * <p>An empty set is a no-op (the builder is not marked as modified).
 * The walk ends as soon as the iterator reports a string element.
 *
 * @param set the code points to optimize
 */
void optimize(UnicodeSet set) {
    if (set.isEmpty()) {
        return;
    }
    for (UnicodeSetIterator it = new UnicodeSetIterator(set);
            it.next() && it.codepoint != UnicodeSetIterator.IS_STRING; ) {
        final int cp = it.codepoint;
        if (trie.get(cp) == Collation.FALLBACK_CE32) {
            final int baseCE32 = base.getFinalCE32(base.getCE32(cp));
            trie.set(cp, copyFromBaseCE32(cp, baseCE32, true));
        }
    }
    modified = true;
}
 
Example #7
Source File: CollationDataBuilder.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Empties {@code contexts} and, for every code point in
 * {@code contextChars}, resets the {@code builtCE32} field of its
 * conditional CE32 to {@code Collation.NO_CE32}.
 */
protected void clearContexts() {
    contexts.setLength(0);
    for (UnicodeSetIterator it = new UnicodeSetIterator(contextChars); it.next(); ) {
        assert it.codepoint != UnicodeSetIterator.IS_STRING;
        final int contextCE32 = trie.get(it.codepoint);
        assert isBuilderContextCE32(contextCE32);
        getConditionalCE32ForCE32(contextCE32).builtCE32 = Collation.NO_CE32;
    }
}
 
Example #8
Source File: GenerateUTR30DataFiles.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Expands one rule into mapping lines and appends them to {@code builder}.
 *
 * <p>{@code leftHandSide} is parsed as a {@code UnicodeSet} pattern (spaces
 * ignored) and walked range by range. When {@code rightHandSide} matches
 * {@code NUMERIC_VALUE_PATTERN}, every code point gets its own line
 * {@code XXXX>YYYY   # name}, with {@code YYYY} being {@code 0x30} plus the
 * code point's numeric value; otherwise each range yields a single line
 * {@code XXXX[..YYYY]>rightHandSide}. A string element in the set is
 * reported on {@code System.err} and the JVM is terminated with status 1.
 *
 * @param builder       receives the generated lines
 * @param leftHandSide  UnicodeSet pattern naming the source code points
 * @param rightHandSide mapping target, or a marker matched by {@code NUMERIC_VALUE_PATTERN}
 */
private static void expandSingleRule
    (StringBuilder builder, String leftHandSide, String rightHandSide)
    throws IllegalArgumentException {
  final UnicodeSet sources = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
  final boolean numericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
  final String hex4 = "%04X";
  final UnicodeSetIterator ranges = new UnicodeSetIterator(sources);
  while (ranges.nextRange()) {
    if (ranges.codepoint == UnicodeSetIterator.IS_STRING) {
      System.err.println("ERROR: String '" + ranges.getString() + "' found in UnicodeSet");
      System.exit(1);
      continue;
    }
    if (numericValue) {
      // Numeric rule: one line per code point, each with its own target.
      for (int cp = ranges.codepoint; cp <= ranges.codepointEnd; ++cp) {
        builder.append(String.format(Locale.ROOT, hex4, cp))
            .append('>')
            .append(String.format(Locale.ROOT, hex4, 0x30 + UCharacter.getNumericValue(cp)))
            .append("   # ")
            .append(UCharacter.getName(cp))
            .append("\n");
      }
    } else {
      // Plain rule: one line per range, "..end" only for multi-code-point ranges.
      builder.append(String.format(Locale.ROOT, hex4, ranges.codepoint));
      if (ranges.codepointEnd > ranges.codepoint) {
        builder.append("..").append(String.format(Locale.ROOT, hex4, ranges.codepointEnd));
      }
      builder.append('>').append(rightHandSide).append("\n");
    }
  }
}
 
Example #9
Source File: CollationBuilder.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Adds mappings for composites whose decomposition starts with the last
 * starter of {@code nfdString}.
 *
 * <p>Scans {@code nfdString} backwards for the last code point whose
 * combining class (per {@code nfd}) is 0. Returns without doing anything
 * when there is no such code point, when it is a Hangul Jamo L, or when
 * {@code nfcImpl.getCanonStartSet} reports no composites for it. Each
 * composite that can be merged into the string is offered to
 * {@code addIfDifferent}; when a mapping was actually added,
 * {@code addOnlyClosure} is called for the new NFD string as well.
 *
 * @param nfdPrefix the prefix, passed through to the CE lookup and the add calls
 * @param nfdString the string whose tail is examined
 */
private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) {
    // Look for the last starter in the NFD string.
    int indexAfterLastStarter = nfdString.length();
    int lastStarter = -1;  // stays negative while no starter has been found
    while (indexAfterLastStarter > 0) {
        final int cp = Character.codePointBefore(nfdString, indexAfterLastStarter);
        if (nfd.getCombiningClass(cp) == 0) {
            lastStarter = cp;
            break;
        }
        indexAfterLastStarter -= Character.charCount(cp);
    }
    if (lastStarter < 0) {
        return;  // no starter at all
    }
    // No closure to Hangul syllables since we decompose them on the fly.
    if (Hangul.isJamoL(lastStarter)) {
        return;
    }

    // Are there any composites whose decomposition starts with the lastStarter?
    // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
    // We might find some more equivalent mappings here if it did.
    UnicodeSet composites = new UnicodeSet();
    if (!nfcImpl.getCanonStartSet(lastStarter, composites)) {
        return;
    }

    StringBuilder newNFDString = new StringBuilder();
    StringBuilder newString = new StringBuilder();
    long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH];
    for (UnicodeSetIterator it = new UnicodeSetIterator(composites); it.next(); ) {
        assert it.codepoint != UnicodeSetIterator.IS_STRING;
        final int composite = it.codepoint;
        final String decomp = nfd.getDecomposition(composite);
        if (mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
                newNFDString, newString)) {
            final int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0);
            // Ignore mappings that we cannot store.
            if (newCEsLength <= Collation.MAX_EXPANSION_LENGTH) {
                // Note: It is possible that the newCEs do not make use of the mapping
                // for which we are adding the tail composites, in which case we might be adding
                // unnecessary mappings.
                // For example, when we add tail composites for ae^ (^=combining circumflex),
                // UCA discontiguous-contraction matching does not find any matches
                // for ae_^ (_=any combining diacritic below) *unless* there is also
                // a contraction mapping for ae.
                // Thus, if there is no ae contraction, then the ae^ mapping is ignored
                // while fetching the newCEs for ae_^.
                // TODO: Try to detect this effectively.
                // (Alternatively, print a warning when prefix contractions are missing.)

                // We do not need an explicit mapping for the NFD strings.
                // It is fine if the NFD input collates like this via a sequence of mappings.
                // It also saves a little bit of space, and may reduce the set of characters with contractions.
                final int ce32 = addIfDifferent(nfdPrefix, newString,
                        newCEs, newCEsLength, Collation.UNASSIGNED_CE32);
                if (ce32 != Collation.UNASSIGNED_CE32) {
                    // was different, was added
                    addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32);
                }
            }
        }
    }
}