com.ibm.icu.lang.UScript Java Examples

The following examples show how to use com.ibm.icu.lang.UScript. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: SpoofChecker.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Appends a brace-delimited rendering of this set to the given builder.
 *
 * <p>An empty set is written as "{ - }", a full set as "{ * }", and any other set as the
 * short names of the contained script codes, each followed by a single space.
 *
 * @param sb the builder that receives the text
 */
public void appendStringTo(StringBuilder sb) {
    sb.append("{ ");
    if (isEmpty()) {
        sb.append("- ");
    } else if (isFull()) {
        sb.append("* ");
    } else {
        // Walk every known script code and emit the ones that are members.
        int code = 0;
        while (code < UScript.CODE_LIMIT) {
            if (get(code)) {
                sb.append(UScript.getShortName(code)).append(" ");
            }
            code++;
        }
    }
    sb.append("}");
}

Example #2

Source File: CollationFastLatinBuilder.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Writes the header unit and one placeholder per special group into {@code result}, and
 * records the primary boundaries of the special, digit and Latin groups.
 *
 * @param data the collation data that is queried for group primaries
 * @return {@code false} if a special group's last primary, the first digit primary or the
 *         first Latin primary is reported as 0 (missing data); {@code true} otherwise
 */
private boolean loadGroups(CollationData data) {
    headerLength = 1 + NUM_SPECIAL_GROUPS;
    // Header unit: version in the upper byte, header length in the lower byte.
    final int headerUnit = (CollationFastLatin.VERSION << 8) | headerLength;
    result.append((char) headerUnit);

    // The first few reordering groups should be special groups
    // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
    for (int group = 0; group < NUM_SPECIAL_GROUPS; group++) {
        lastSpecialPrimaries[group] =
                data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + group);
        if (lastSpecialPrimaries[group] == 0) {
            // missing data
            return false;
        }
        result.append(0);  // reserve a slot for this group
    }

    firstDigitPrimary = data.getFirstPrimaryForGroup(Collator.ReorderCodes.DIGIT);
    firstLatinPrimary = data.getFirstPrimaryForGroup(UScript.LATIN);
    lastLatinPrimary = data.getLastPrimaryForGroup(UScript.LATIN);

    // A zero primary for digits or Latin means the data is missing.
    return firstDigitPrimary != 0 && firstLatinPrimary != 0;
}

Example #3

Source File: ScriptIterator.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Fast version of UScript.getScript(): code points covered by the {@code basicLatin}
 * table are resolved with an array lookup.
 *
 * <p>When {@code combineCJ} is set, Han, Hiragana and Katakana are all reported as
 * {@code UScript.JAPANESE}, and U+FF10..U+FF19 are reported as {@code UScript.LATIN}.
 *
 * @param codepoint the code point to classify
 * @return the script code for the code point
 */
private int getScript(int codepoint) {
  if (codepoint >= 0 && codepoint < basicLatin.length) {
    return basicLatin[codepoint];
  }

  final int script = UScript.getScript(codepoint);
  if (!combineCJ) {
    return script;
  }

  if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
    return UScript.JAPANESE;
  }
  if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
    // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
    // they are treated as punctuation. we currently have no cleaner way to fix this!
    return UScript.LATIN;
  }
  return script;
}

Example #4

Source File: SpoofChecker.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Computes the resolved script set for a string, leaving out every character whose
 * augmented script set contains the given script. Passing UScript.CODE_LIMIT as the
 * second argument includes all characters.
 *
 * @param input  the text to analyse, read code point by code point
 * @param script the script whose characters are skipped, or UScript.CODE_LIMIT for none
 * @param result receives the intersection of the augmented script sets of the included
 *               characters; it is first reset to contain everything
 */
private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) {
    result.setAll();

    final boolean includeEverything = (script == UScript.CODE_LIMIT);
    final ScriptSet augmented = new ScriptSet();
    int offset = 0;
    while (offset < input.length()) {
        final int cp = Character.codePointAt(input, offset);
        offset += Character.charCount(cp);

        // Compute the augmented script set for the character
        getAugmentedScriptSet(cp, augmented);

        // Narrow the resolved set, unless this character has the script that the caller
        // asked to leave out.
        if (includeEverything || !augmented.get(script)) {
            result.and(augmented);
        }
    }
}

Example #5

Source File: DefaultICUTokenizerConfig.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Maps a break-iterator rule status, together with the script for kana and letter
 * tokens, to a token type string.
 *
 * @param script     the script code of the token
 * @param ruleStatus the rule status reported for the token
 * @return the token type, or {@code "<OTHER>"} for an unrecognised rule status
 */
@Override
public String getType(int script, int ruleStatus) {
  if (ruleStatus == RuleBasedBreakIterator.WORD_IDEO) {
    return WORD_IDEO;
  }
  if (ruleStatus == RuleBasedBreakIterator.WORD_KANA) {
    // Anything that is not Hiragana is reported as Katakana.
    return script == UScript.HIRAGANA ? WORD_HIRAGANA : WORD_KATAKANA;
  }
  if (ruleStatus == RuleBasedBreakIterator.WORD_LETTER) {
    return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
  }
  if (ruleStatus == RuleBasedBreakIterator.WORD_NUMBER) {
    return WORD_NUMBER;
  }
  if (ruleStatus == EMOJI_SEQUENCE_STATUS) {
    return WORD_EMOJI;
  }
  /* some other custom code */
  return "<OTHER>";
}

Example #6

Source File: ScriptIterator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

6 votes

/**
 * Fast version of UScript.getScript(). Code points inside the {@code basicLatin} table
 * are an array lookup; everything else is delegated to ICU.
 *
 * <p>With {@code combineCJ} enabled, Han, Hiragana and Katakana collapse into
 * {@code UScript.JAPANESE} and U+FF10..U+FF19 are reported as {@code UScript.LATIN}.
 *
 * @param codepoint the code point to classify
 * @return the script code for the code point
 */
private int getScript(int codepoint) {
    final boolean inTable = 0 <= codepoint && codepoint < basicLatin.length;
    if (inTable) {
        return basicLatin[codepoint];
    }

    final int resolved = UScript.getScript(codepoint);
    if (combineCJ) {
        switch (resolved) {
            case UScript.HAN:
            case UScript.HIRAGANA:
            case UScript.KATAKANA:
                return UScript.JAPANESE;
            default:
                break;
        }
        if (0xFF10 <= codepoint && codepoint <= 0xFF19) {
            // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
            // they are treated as punctuation. we currently have no cleaner way to fix this!
            return UScript.LATIN;
        }
    }
    return resolved;
}

Example #7

Source File: SpoofChecker.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Computes the augmented script set for a code point, according to UTS 39 section 5.1.
 *
 * @param codePoint the code point to examine
 * @param result    cleared, then filled with the code point's script extensions plus the
 *                  combined-script entries added below
 */
private static void getAugmentedScriptSet(int codePoint, ScriptSet result) {
    result.clear();
    UScript.getScriptExtensions(codePoint, result);

    // Section 5.1 step 1: read the base scripts first, then add the combined scripts.
    // None of the scripts added here is one of the scripts being tested.
    final boolean han = result.get(UScript.HAN);
    final boolean hiragana = result.get(UScript.HIRAGANA);
    final boolean katakana = result.get(UScript.KATAKANA);
    final boolean hangul = result.get(UScript.HANGUL);
    final boolean bopomofo = result.get(UScript.BOPOMOFO);

    if (han || bopomofo) {
        result.set(UScript.HAN_WITH_BOPOMOFO);
    }
    if (han || hiragana || katakana) {
        result.set(UScript.JAPANESE);
    }
    if (han || hangul) {
        result.set(UScript.KOREAN);
    }

    // Section 5.1 step 2: Common or Inherited matches every script.
    if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) {
        result.setAll();
    }
}

Example #8

Source File: IdentifierInfo.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Parse a text list of scripts into a BitSet. Items are separated by whitespace that may
 * be preceded by a comma; empty items are skipped.
 *
 * @param scriptsString the string to be parsed
 * @return BitSet of UScript values.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static BitSet parseScripts(String scriptsString) {
    final BitSet scripts = new BitSet();
    final String[] names = scriptsString.trim().split(",?\\s+");
    for (int i = 0; i < names.length; i++) {
        final String name = names[i];
        if (name.isEmpty()) {
            continue;
        }
        scripts.set(UScript.getCodeFromName(name));
    }
    return scripts;
}

Example #9

Source File: IdentifierInfo.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Produce a readable string of a set of scripts: the short names of the set bits in
 * ascending order, separated by single spaces.
 *
 * @param scripts a BitSet of UScript values
 * @return a readable string of a set of scripts
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static String displayScripts(BitSet scripts) {
    final StringBuilder text = new StringBuilder();
    int code = scripts.nextSetBit(0);
    while (code >= 0) {
        if (text.length() > 0) {
            text.append(' ');
        }
        text.append(UScript.getShortName(code));
        code = scripts.nextSetBit(code + 1);
    }
    return text.toString();
}

Example #10

Source File: SegmentationIcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

5 votes

/**
 * Checks that every token produced for "This is a test" carries the Latin script
 * attribute (code, name and short name) and that the reflected string reports
 * "script=Latin".
 *
 * @throws Exception if the analyzer or token stream fails
 */
public void testTokenAttributes() throws Exception {
    Analyzer a = createAnalyzer();
    try {
        try (TokenStream ts = a.tokenStream("dummy", "This is a test")) {
            ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                assertEquals(UScript.LATIN, scriptAtt.getCode());
                assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
                assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
                assertTrue(ts.reflectAsString(false).contains("script=Latin"));
            }
            ts.end();
        }
    } finally {
        // Release the analyzer even when an assertion or the stream throws.
        destroyAnalzyer(a);
    }
}

Example #11

Source File: IdentifierInfo.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Find the "tightest" restriction level that the identifier satisfies.
 *
 * @return the restriction level.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public RestrictionLevel getRestrictionLevel() {
    final boolean outsideProfile = !identifierProfile.containsAll(identifier);
    if (outsideProfile || getNumerics().size() > 1) {
        return RestrictionLevel.UNRESTRICTIVE;
    }
    if (ASCII.containsAll(identifier)) {
        return RestrictionLevel.ASCII;
    }

    // This is a bit tricky. We look at a number of factors.
    //   The number of scripts in the text.
    //   Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
    //   Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)

    // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
    //       time it is created, in setIdentifier().
    final int alternatesWeight;
    if (commonAmongAlternates.cardinality() == 0) {
        alternatesWeight = scriptSetSet.size();
    } else {
        alternatesWeight = 1;
    }
    final int scriptCount = requiredScripts.cardinality() + alternatesWeight;

    if (scriptCount < 2) {
        return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE;
    }

    final boolean coveredByCjkSet = containsWithAlternates(JAPANESE, requiredScripts)
            || containsWithAlternates(CHINESE, requiredScripts)
            || containsWithAlternates(KOREAN, requiredScripts);
    if (coveredByCjkSet) {
        return RestrictionLevel.HIGHLY_RESTRICTIVE;
    }

    final boolean latinPlusOneSafeScript = scriptCount == 2
            && requiredScripts.get(UScript.LATIN)
            && !requiredScripts.intersects(CONFUSABLE_WITH_LATIN);
    return latinPlusOneSafeScript
            ? RestrictionLevel.MODERATELY_RESTRICTIVE
            : RestrictionLevel.MINIMALLY_RESTRICTIVE;
}

Example #12

Source File: AnyTransliterator.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Return the script code for a given name, or
 * UScript.INVALID_CODE if not found. When the lookup yields several
 * codes, the first one is returned.
 */
private static int scriptNameToCode(String name) {
    final int[] codes;
    try{
        codes = UScript.getCode(name);
    }catch( MissingResourceException e){
        ///CLOVER:OFF
        return UScript.INVALID_CODE;
        ///CLOVER:ON
    }
    if (codes == null) {
        return UScript.INVALID_CODE;
    }
    return codes[0];
}

Example #13

Source File: SpoofChecker.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Adds to {@code allowedChars} every character whose Script property is one of the
 * script codes that UScript reports for the given locale.
 *
 * @param locale       the locale whose scripts are looked up
 * @param allowedChars the set that accumulates the permitted characters
 */
private void addScriptChars(ULocale locale, UnicodeSet allowedChars) {
    final int[] scriptCodes = UScript.getCode(locale);
    if (scriptCodes == null) {
        // It's an unknown script.
        // Maybe they asked for the script of "zxx", which refers to no linguistic content.
        // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU.
        return;
    }

    final UnicodeSet scriptChars = new UnicodeSet();
    for (int scriptCode : scriptCodes) {
        scriptChars.applyIntPropertyValue(UProperty.SCRIPT, scriptCode);
        allowedChars.addAll(scriptChars);
    }
}

Example #14

Source File: SpoofChecker.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
 * associated with the specified locales. Any previously specified list of locales is replaced by the new
 * settings.
 *
 * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is
 * determined. Characters from this set of scripts, along with characters from the "common" and "inherited"
 * Unicode Script categories will be permitted.
 *
 * Supplying an empty set (or null) removes all restrictions; characters from any script will be allowed.
 *
 * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a
 * non-empty list of locales.
 *
 * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function.
 * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters.
 *
 * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of
 * {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with
 * the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}.
 *
 * @param locales
 *            A Set of ULocales, from which the language and associated script are extracted. If the locales Set
 *            is null or empty, no restrictions will be placed on the allowed characters.
 *
 * @return self
 * @stable ICU 4.6
 */
public Builder setAllowedLocales(Set<ULocale> locales) {
    // Start from a clean slate: previous settings are always replaced.
    fAllowedCharsSet.clear();
    fAllowedLocales.clear();

    // If our caller provided no locales (null or an empty set), we disable the
    // allowed characters checking. Null is handled here so that the documented
    // "null means no restrictions" contract holds instead of failing on iteration.
    if (locales == null || locales.isEmpty()) {
        fAllowedCharsSet.add(0, 0x10ffff);
        fChecks &= ~CHAR_LIMIT;
        return this;
    }

    for (ULocale locale : locales) {
        // Add the script chars for this locale to the accumulating set
        // of allowed chars.
        addScriptChars(locale, fAllowedCharsSet);
    }

    // Add all common and inherited characters to the set of allowed
    // chars.
    UnicodeSet tempSet = new UnicodeSet();
    tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON);
    fAllowedCharsSet.addAll(tempSet);
    tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED);
    fAllowedCharsSet.addAll(tempSet);

    // Store the updated spoof checker state.
    fAllowedLocales.addAll(locales);
    fChecks |= CHAR_LIMIT;
    return this;
}

Example #15

Source File: LaoBreakEngine.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Reports whether this engine handles the given character for the given break type.
 *
 * @param c         the code point to test
 * @param breakType the kind of break being computed
 * @return {@code true} only for word or line breaking of a character whose Script
 *         property is Lao
 */
public boolean handles(int c, int breakType) {
    final boolean supportedKind =
            breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE;
    if (!supportedKind) {
        return false;
    }
    return UCharacter.getIntPropertyValue(c, UProperty.SCRIPT) == UScript.LAO;
}

Example #16

Source File: UnicodeData.java From es6draft with MIT License

5 votes

/**
 * Reports whether the code point has the given script value. Five code points
 * (U+3000, U+3004, U+3012, U+3020, U+3036) are answered here and match only
 * {@code UScript.COMMON}; all others are delegated to the superclass.
 *
 * @param codePoint the code point to test
 * @param value     the script code to test for
 * @return {@code true} if the code point is considered to have that script
 */
@Override
public boolean has(int codePoint, int value) {
    // See https://ssl.icu-project.org/trac/ticket/13462
    final boolean commonOnly = codePoint == 0x3000
            || codePoint == 0x3004
            || codePoint == 0x3012
            || codePoint == 0x3020
            || codePoint == 0x3036;
    if (commonOnly) {
        return value == UScript.COMMON;
    }
    return super.has(codePoint, value);
}

Example #17

Source File: UnicodeData.java From es6draft with MIT License

5 votes

/**
 * Reports whether the code point has the given script value. Five code points
 * (U+3000, U+3004, U+3012, U+3020, U+3036) are answered here and match only
 * {@code UScript.COMMON}; all others are delegated to {@code UScript.hasScript}.
 *
 * @param codePoint the code point to test
 * @param value     the script code to test for
 * @return {@code true} if the code point is considered to have that script
 */
@Override
public boolean has(int codePoint, int value) {
    // See https://ssl.icu-project.org/trac/ticket/13462
    final boolean commonOnly;
    switch (codePoint) {
    case 0x3000:
    case 0x3004:
    case 0x3012:
    case 0x3020:
    case 0x3036:
        commonOnly = true;
        break;
    default:
        commonOnly = false;
        break;
    }
    return commonOnly ? value == UScript.COMMON : UScript.hasScript(codePoint, value);
}

Example #18

Source File: DefaultIcuTokenizerConfig.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

5 votes

/**
 * Maps a break-iterator rule status, together with the script for kana and letter
 * tokens, to a token type string.
 *
 * @param script     the script code of the token
 * @param ruleStatus the rule status reported for the token
 * @return the token type, or {@code "<OTHER>"} for an unrecognised rule status
 */
@Override
public String getType(int script, int ruleStatus) {
    final String type;
    switch (ruleStatus) {
        case RuleBasedBreakIterator.WORD_IDEO:
            type = WORD_IDEO;
            break;
        case RuleBasedBreakIterator.WORD_KANA:
            // Anything that is not Hiragana is reported as Katakana.
            type = (script == UScript.HIRAGANA) ? WORD_HIRAGANA : WORD_KATAKANA;
            break;
        case RuleBasedBreakIterator.WORD_LETTER:
            type = (script == UScript.HANGUL) ? WORD_HANGUL : WORD_LETTER;
            break;
        case RuleBasedBreakIterator.WORD_NUMBER:
            type = WORD_NUMBER;
            break;
        default: /* some other custom code */
            type = "<OTHER>";
            break;
    }
    return type;
}

Example #19

Source File: ULocale.java From fitnotifications with Apache License 2.0

5 votes

/**
 * {@icu} Returns whether this locale's script is written right-to-left.
 * If there is no script subtag, then the likely script is used,
 * see {@link #addLikelySubtags(ULocale)}.
 * If no likely script is known, then false is returned.
 *
 * <p>A script is right-to-left according to the CLDR script metadata
 * which corresponds to whether the script's letters have Bidi_Class=R or AL.
 *
 * <p>Returns true for "ar" and "en-Hebr", false for "zh" and "fa-Cyrl".
 *
 * @return true if the locale's script is written right-to-left
 * @stable ICU 54
 */
public boolean isRightToLeft() {
    String script = getScript();
    if (script.length() == 0) {
        // Fastpath: We know the likely scripts and their writing direction
        // for some common languages.
        String lang = getLanguage();
        // An empty language cannot use the fastpath table, but it must still fall
        // through to the likely-subtags lookup below instead of returning false, so
        // that the documented "likely script is used" contract also holds for
        // locales that have no language subtag.
        if (lang.length() != 0) {
            int langIndex = LANG_DIR_STRING.indexOf(lang);
            if (langIndex >= 0) {
                // The character following the language code encodes the direction.
                switch (LANG_DIR_STRING.charAt(langIndex + lang.length())) {
                case '-': return false;
                case '+': return true;
                default: break;  // partial match of a longer code
                }
            }
        }
        // Otherwise, find the likely script.
        ULocale likely = addLikelySubtags(this);
        script = likely.getScript();
        if (script.length() == 0) {
            return false;
        }
    }
    int scriptCode = UScript.getCodeFromName(script);
    return UScript.isRightToLeft(scriptCode);
}

Example #20

Source File: DefaultIcuTokenizerConfig.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

5 votes

/**
 * Returns a fresh clone of the break iterator to use for the given script.
 *
 * @param script the script code of the text run
 * @return a clone of the CJK iterator for {@code UScript.JAPANESE}, of the Myanmar
 *         syllable iterator for {@code UScript.MYANMAR} when {@code myanmarAsWords} is
 *         off, and of the default iterator in every other case
 */
@Override
public BreakIterator getBreakIterator(int script) {
    if (script == UScript.JAPANESE) {
        return (BreakIterator) cjkBreakIterator.clone();
    }
    if (script == UScript.MYANMAR && !myanmarAsWords) {
        return (BreakIterator) myanmarSyllableIterator.clone();
    }
    // Everything else, including Myanmar treated as words, uses the default rules.
    return (BreakIterator) defaultBreakIterator.clone();
}

Example #21

Source File: MCRLanguageDetector.java From mycore with GNU General Public License v3.0

5 votes

/**
 * Increases the score of the script of every code point in the text.
 *
 * <p>The text is walked by code point rather than by UTF-16 unit, so a supplementary
 * character (a surrogate pair) is scored once for its own script instead of once for
 * the character and a second time for its bare trail surrogate.
 *
 * @param text   the text to analyse
 * @param scores the per-script counters, updated through {@code increaseScoreFor}
 */
private static void buildScores(String text, Map<Integer, AtomicInteger> scores) {
    try {
        for (int i = 0; i < text.length();) {
            int codePoint = text.codePointAt(i);
            increaseScoreFor(scores, UScript.getScript(codePoint));
            // Skip both UTF-16 units of a surrogate pair.
            i += Character.charCount(codePoint);
        }
    } catch (Exception ignored) {
        // Deliberately best-effort: a failure leaves the scores collected so far untouched.
    }
}

Example #22

Source File: ScriptAttributeImpl.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Reflects the script of this attribute under the key "script".
 *
 * @param reflector the reflector that receives the value
 */
@Override
public void reflectWith(AttributeReflector reflector) {
  // when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to
  // mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset),
  // but this is just to help prevent confusion.
  final String displayName;
  if (code == UScript.JAPANESE) {
    displayName = "Chinese/Japanese";
  } else {
    displayName = getName();
  }
  reflector.reflect(ScriptAttribute.class, "script", displayName);
}

Example #23

Source File: CompositeBreakIterator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Set a new region of text to be examined by this iterator. The break iterator is
 * chosen for the first script run; when there is no run, the iterator for
 * {@code UScript.COMMON} is installed over an empty range.
 *
 * @param text buffer of text
 * @param start offset into buffer
 * @param length maximum length to examine
 */
void setText(final char text[], int start, int length) {
  this.text = text;
  scriptIterator.setText(text, start, length);

  if (!scriptIterator.next()) {
    // No script run at all.
    rbbi = getBreakIterator(UScript.COMMON);
    rbbi.setText(text, 0, 0);
    return;
  }

  rbbi = getBreakIterator(scriptIterator.getScriptCode());
  final int runStart = scriptIterator.getScriptStart();
  final int runLength = scriptIterator.getScriptLimit() - runStart;
  rbbi.setText(text, runStart, runLength);
}

Example #24

Source File: DefaultICUTokenizerConfig.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Returns a fresh clone of the break iterator to use for the given script.
 *
 * @param script the script code of the text run
 * @return a clone of the CJK iterator for {@code UScript.JAPANESE}; for
 *         {@code UScript.MYANMAR} a clone of the default iterator when
 *         {@code myanmarAsWords} is set and of the syllable iterator otherwise; a clone
 *         of the default iterator for every other script
 */
@Override
public RuleBasedBreakIterator getBreakIterator(int script) {
  if (script == UScript.MYANMAR) {
    if (myanmarAsWords) {
      return (RuleBasedBreakIterator)defaultBreakIterator.clone();
    }
    return (RuleBasedBreakIterator)myanmarSyllableIterator.clone();
  }
  if (script == UScript.JAPANESE) {
    return (RuleBasedBreakIterator)cjkBreakIterator.clone();
  }
  return (RuleBasedBreakIterator)defaultBreakIterator.clone();
}

Example #25

Source File: ScriptIterator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Iterates to the next script run, returning true if one exists. On success the run is
 * described by {@code scriptStart}, {@code scriptLimit} and {@code scriptCode}.
 *
 * @return true if there is another script run, false otherwise.
 */
boolean next() {
  if (scriptLimit >= limit) {
    return false;
  }

  // The new run begins where the previous one ended and starts out as Common.
  scriptStart = scriptLimit;
  scriptCode = UScript.COMMON;

  while (index < limit) {
    final int codePoint = UTF16.charAt(text, start, limit, index - start);
    final int codePointScript = getScript(codePoint);

    /*
     * From UTR #24: Implementations that determine the boundaries between
     * characters of given scripts should never break between a non-spacing
     * mark and its base character. Thus for boundary determinations and
     * similar sorts of processing, a non-spacing mark — whatever its script
     * value — should inherit the script value of its base character.
     */
    final boolean continuesRun = isSameScript(scriptCode, codePointScript)
        || UCharacter.getType(codePoint) == ECharacterCategory.NON_SPACING_MARK;
    if (!continuesRun) {
      break;
    }

    index += UTF16.getCharCount(codePoint);

    /*
     * Inherited or Common becomes the script code of the surrounding text.
     */
    if (scriptCode <= UScript.INHERITED && codePointScript > UScript.INHERITED) {
      scriptCode = codePointScript;
    }
  }

  scriptLimit = index;
  return true;
}

Example #26

Source File: ScriptIterator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Set a new region of text to be examined by this iterator. The iteration state is
 * rewound to the start of the region and the current script is reset to
 * {@code UScript.INVALID_CODE}.
 *
 * @param text text buffer to examine
 * @param start offset into buffer
 * @param length maximum length to examine
 */
void setText(char text[], int start, int length) {
  this.text = text;
  this.limit = start + length;
  // Nothing has been consumed yet: every position marker sits at the region start.
  this.start = this.index = this.scriptStart = this.scriptLimit = start;
  this.scriptCode = UScript.INVALID_CODE;
}

Example #27

Source File: TestICUTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Checks that every token produced for "This is a test" carries the Latin script
 * attribute (code, name and short name) and that the reflected string reports
 * "script=Latin".
 *
 * @throws Exception if the token stream fails
 */
public void testTokenAttributes() throws Exception {
  final String expectedName = UScript.getName(UScript.LATIN);
  final String expectedShortName = UScript.getShortName(UScript.LATIN);
  try (TokenStream stream = a.tokenStream("dummy", "This is a test")) {
    final ScriptAttribute script = stream.addAttribute(ScriptAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      assertEquals(UScript.LATIN, script.getCode());
      assertEquals(expectedName, script.getName());
      assertEquals(expectedShortName, script.getShortName());
      final String reflected = stream.reflectAsString(false);
      assertTrue(reflected.contains("script=Latin"));
    }
    stream.end();
  }
}

Example #28

Source File: ULocale.java From trekarta with GNU General Public License v3.0

5 votes

/**
 * {@icu} Returns whether this locale's script is written right-to-left.
 * If there is no script subtag, then the likely script is used,
 * see {@link #addLikelySubtags(ULocale)}.
 * If no likely script is known, then false is returned.
 *
 * <p>A script is right-to-left according to the CLDR script metadata
 * which corresponds to whether the script's letters have Bidi_Class=R or AL.
 *
 * <p>Returns true for "ar" and "en-Hebr", false for "zh" and "fa-Cyrl".
 *
 * @return true if the locale's script is written right-to-left
 * @stable ICU 54
 */
public boolean isRightToLeft() {
    String script = getScript();
    if (script.isEmpty()) {
        // Fastpath: We know the likely scripts and their writing direction
        // for some common languages.
        final String lang = getLanguage();
        final int langIndex = lang.isEmpty() ? -1 : LANG_DIR_STRING.indexOf(lang);
        if (langIndex >= 0) {
            // The character following the language code encodes the direction.
            final char direction = LANG_DIR_STRING.charAt(langIndex + lang.length());
            if (direction == '-') {
                return false;
            }
            if (direction == '+') {
                return true;
            }
            // Anything else is a partial match of a longer code.
        }
        // Otherwise, find the likely script.
        script = addLikelySubtags(this).getScript();
        if (script.isEmpty()) {
            return false;
        }
    }
    return UScript.isRightToLeft(UScript.getCodeFromName(script));
}

Example #29

Source File: SortingTestCase.java From vespa with Apache License 2.0

5 votes

/**
 * Asserts the versions, reorder codes, ordering behaviour and tailoring rules expected
 * of the Arabic collator.
 *
 * @param col the collator under test; it is cast to RuleBasedCollator to read its rules
 */
private void requireThatArabicHasCorrectRules(Collator col) {
    final int[] expectedReorderCodes = {UScript.ARABIC};
    final String expectedRules = " [reorder Arab]&ت<<ة<<<ﺔ<<<ﺓ&ي<<ى<<<ﯨ<<<ﯩ<<<ﻰ<<<ﻯ<<<ﲐ<<<ﱝ";

    assertEquals("6.2.0.0", col.getUCAVersion().toString());
    assertEquals("58.0.0.6", col.getVersion().toString());
    assertEquals(Arrays.toString(expectedReorderCodes), Arrays.toString(col.getReorderCodes()));

    // Latin still sorts internally as usual, but Arabic is reordered ahead of it.
    assertTrue(col.compare("a", "b") < 0);
    assertTrue(col.compare("a", "aس") < 0);
    assertFalse(col.compare("س", "a") < 0);

    final RuleBasedCollator ruleBased = (RuleBasedCollator) col;
    assertEquals(expectedRules, ruleBased.getRules());
    // Same comparison as above, repeated after the rules have been read.
    assertFalse(col.compare("س", "a") < 0);
}

Example #30

Source File: SortingTestCase.java From vespa with Apache License 2.0

5 votes

/**
 * Asserts the versions and reorder codes expected of the Chinese collator and that its
 * tailoring rules are not empty.
 *
 * @param col the collator under test; it is cast to RuleBasedCollator to read its rules
 */
private void requireThatChineseHasCorrectRules(Collator col) {
    final int[] expectedReorderCodes = {UScript.HAN};

    assertEquals("8.0.0.0", col.getUCAVersion().toString());
    assertEquals("153.64.29.0", col.getVersion().toString());
    assertEquals(Arrays.toString(expectedReorderCodes), Arrays.toString(col.getReorderCodes()));

    final RuleBasedCollator ruleBased = (RuleBasedCollator) col;
    assertNotEquals("", ruleBased.getRules());
}