Java Code Examples for com.ibm.icu.lang.UScript#HAN

The following examples show how to use com.ibm.icu.lang.UScript#HAN. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ScriptIterator.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Fast variant of UScript.getScript(): code points that index into
 * {@code basicLatin} are answered by array lookup, everything else is
 * delegated to UScript. When {@code combineCJ} is set, Han, Hiragana and
 * Katakana are all reported as {@code UScript.JAPANESE}.
 */
private int getScript(int codepoint) {
  if (0 <= codepoint && codepoint < basicLatin.length) {
    return basicLatin[codepoint];
  }

  final int resolved = UScript.getScript(codepoint);
  if (!combineCJ) {
    return resolved;
  }

  final boolean isCJ =
      resolved == UScript.HAN || resolved == UScript.HIRAGANA || resolved == UScript.KATAKANA;
  if (isCJ) {
    return UScript.JAPANESE;
  }

  // When CJK dictionary breaking is in use, full width numbers must not reach it,
  // otherwise they are treated as punctuation. There is currently no cleaner fix.
  final boolean isFullWidthDigit = codepoint >= 0xFF10 && codepoint <= 0xFF19;
  return isFullWidthDigit ? UScript.LATIN : resolved;
}
 
Example 2
Source File: ScriptIterator.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Quick replacement for UScript.getScript(). Code points inside the
 * {@code basicLatin} table are resolved by array lookup; all others go
 * through UScript, optionally folding Han/Hiragana/Katakana into
 * {@code UScript.JAPANESE} when {@code combineCJ} is enabled.
 */
private int getScript(int codepoint) {
    if (codepoint < 0 || codepoint >= basicLatin.length) {
        int looked = UScript.getScript(codepoint);
        if (combineCJ) {
            switch (looked) {
                case UScript.HAN:
                case UScript.HIRAGANA:
                case UScript.KATAKANA:
                    return UScript.JAPANESE;
                default:
                    break;
            }
            // Keep full width numbers away from CJK dictionary breaking, which would
            // otherwise treat them as punctuation. No cleaner workaround exists yet.
            if (0xFF10 <= codepoint && codepoint <= 0xFF19) {
                return UScript.LATIN;
            }
        }
        return looked;
    }
    return basicLatin[codepoint];
}
 
Example 3
Source File: SortingTestCase.java    From vespa with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts the expected UCA version, collator version, reorder codes (Han only)
 * and non-empty rule string for the given collator.
 *
 * @param col the collator under test; it is cast to RuleBasedCollator for the rules check
 */
private void requireThatChineseHasCorrectRules(Collator col) {
    String ucaVersion = col.getUCAVersion().toString();
    assertEquals("8.0.0.0", ucaVersion);

    String collatorVersion = col.getVersion().toString();
    assertEquals("153.64.29.0", collatorVersion);

    // Compare the reorder codes through their string form.
    final int[] expectedCodes = {UScript.HAN};
    String expectedText = Arrays.toString(expectedCodes);
    String actualText = Arrays.toString(col.getReorderCodes());
    assertEquals(expectedText, actualText);

    RuleBasedCollator ruleBased = (RuleBasedCollator) col;
    assertNotEquals("", ruleBased.getRules());
}
 
Example 4
Source File: RuleBasedBreakIterator.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Finds, or lazily creates, the language break engine responsible for the character {@code c}.
 *
 * <p>Engines already registered in {@code fBreakEngines} are asked first, via
 * {@code handles(c, fBreakType)}. If none accepts the character, a new engine is chosen from
 * the character's script and registered with {@code putIfAbsent}, so that a concurrently
 * registered engine for the same script takes precedence over the one built here.
 *
 * @param c the character to look up (passed to UCharacter.getIntPropertyValue, so presumably
 *          a code point)
 * @return an engine from {@code fBreakEngines}, a newly created engine,
 *         {@code fUnhandledBreakEngine} when no dedicated engine applies, or {@code null}
 *         when creating an engine threw an IOException
 */
private LanguageBreakEngine getLanguageBreakEngine(int c) {

        // We have a dictionary character.
        // Does an already instantiated break engine handle it?
        for (LanguageBreakEngine candidate : fBreakEngines.values()) {
            if (candidate.handles(c, fBreakType)) {
                return candidate;
            }
        }

        // if we don't have an existing engine, build one.
        int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
        if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
            // Katakana, Hiragana and Han are handled by the same dictionary engine.
            // Fold them together for mapping from script -> engine.
            script = UScript.HAN;
        }

        // NOTE(review): the value fetched here is never read. Every path through the switch
        // below assigns eng, and the catch clause resets it to null. The lookup looks like a
        // leftover from the disabled check that follows.
        LanguageBreakEngine eng = fBreakEngines.get(script);
        /*
        if (eng != null && !eng.handles(c, fBreakType)) {
            fUnhandledBreakEngine.handleChar(c, getBreakType());
            eng = fUnhandledBreakEngine;
        } else  */  {
            try {
                // Pick the engine by script. Where no dedicated engine applies, the character
                // is reported to fUnhandledBreakEngine, which then stands in as the engine.
                switch (script) {
                case UScript.THAI:
                    eng = new ThaiBreakEngine();
                    break;
                case UScript.LAO:
                    eng = new LaoBreakEngine();
                    break;
                case UScript.MYANMAR:
                    eng = new BurmeseBreakEngine();
                    break;
                case UScript.KHMER:
                    eng = new KhmerBreakEngine();
                    break;
                case UScript.HAN:
                    // A CJK engine is only created for word breaking.
                    if (getBreakType() == KIND_WORD) {
                        eng = new CjkBreakEngine(false);
                    }
                    else {
                        fUnhandledBreakEngine.handleChar(c, getBreakType());
                        eng = fUnhandledBreakEngine;
                    }
                    break;
                case UScript.HANGUL:
                    // Same as HAN, except the CjkBreakEngine flag is true.
                    if (getBreakType() == KIND_WORD) {
                        eng = new CjkBreakEngine(true);
                    } else {
                        fUnhandledBreakEngine.handleChar(c, getBreakType());
                        eng = fUnhandledBreakEngine;
                    }
                    break;
                default:
                    fUnhandledBreakEngine.handleChar(c, getBreakType());
                    eng = fUnhandledBreakEngine;
                    break;
                }
            } catch (IOException e) {
                // Engine construction failed; the caller receives null.
                eng = null;
            }
        }

        // Only dedicated engines are registered; the unhandled engine and null are not stored.
        if (eng != null && eng != fUnhandledBreakEngine) {
            LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
            if (existingEngine != null) {
                // There was a race & another thread was first to register an engine for this script.
                // Use theirs and discard the one we just created.
                eng = existingEngine;
            }
            // assert eng.handles(c, fBreakType);
        }
        return eng;
    }
 
Example 5
Source File: UTS46.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Applies the CONTEXTO rule sets quoted in the comments below to one label and
 * records each violation on {@code info} via {@code addLabelError}.
 *
 * @param label       text containing the label
 * @param labelStart  index of the label's first char in {@code label}
 * @param labelLength number of chars in the label
 * @param info        receives CONTEXTO_PUNCTUATION / CONTEXTO_DIGITS errors
 */
private void checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
    final int lastIndex = labelStart + labelLength - 1;  // inclusive
    // Digit family seen most recently: -1 for 066x, +1 for 06Fx, 0 for none so far.
    int digitFamily = 0;
    for (int pos = labelStart; pos <= lastIndex; ++pos) {
        final int unit = label.charAt(pos);
        if (unit < 0xb7) {
            // ASCII fastpath
            continue;
        }
        if (unit == 0xb7) {
            // Appendix A.3. MIDDLE DOT (U+00B7)
            // Rule Set:
            //  False;
            //  If Before(cp) .eq.  U+006C And
            //     After(cp) .eq.  U+006C Then True;
            final boolean precededByL = labelStart < pos && label.charAt(pos - 1) == 'l';
            final boolean betweenLs =
                    precededByL && pos < lastIndex && label.charAt(pos + 1) == 'l';
            if (!betweenLs) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        } else if (unit == 0x375) {
            // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
            // Rule Set:
            //  False;
            //  If Script(After(cp)) .eq.  Greek Then True;
            final boolean greekFollows = pos < lastIndex
                    && UScript.GREEK == UScript.getScript(Character.codePointAt(label, pos + 1));
            if (!greekFollows) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        } else if (unit == 0x5f3 || unit == 0x5f4) {
            // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
            // Rule Set:
            //  False;
            //  If Script(Before(cp)) .eq.  Hebrew Then True;
            //
            // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
            // Rule Set:
            //  False;
            //  If Script(Before(cp)) .eq.  Hebrew Then True;
            final boolean hebrewPrecedes = labelStart < pos
                    && UScript.HEBREW == UScript.getScript(Character.codePointBefore(label, pos));
            if (!hebrewPrecedes) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        } else if (0x660 <= unit && unit <= 0x669) {
            // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
            // Rule Set:
            //  True;
            //  For All Characters:
            //    If cp .in. 06F0..06F9 Then False;
            //  End For;
            if (digitFamily > 0) {
                addLabelError(info, Error.CONTEXTO_DIGITS);
            }
            digitFamily = -1;
        } else if (0x6f0 <= unit && unit <= 0x6f9) {
            // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
            // Rule Set:
            //  True;
            //  For All Characters:
            //    If cp .in. 0660..0669 Then False;
            //  End For;
            if (digitFamily < 0) {
                addLabelError(info, Error.CONTEXTO_DIGITS);
            }
            digitFamily = 1;
        } else if (unit == 0x30fb) {
            // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
            // Rule Set:
            //  False;
            //  For All Characters:
            //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
            //  End For;
            // Scan the whole label by code point; stop at the first qualifying script.
            boolean foundKanaOrHan = false;
            int scan = labelStart;
            while (scan <= lastIndex) {
                final int cp = Character.codePointAt(label, scan);
                final int script = UScript.getScript(cp);
                if (script == UScript.HIRAGANA || script == UScript.KATAKANA
                        || script == UScript.HAN) {
                    foundKanaOrHan = true;
                    break;
                }
                scan += Character.charCount(cp);
            }
            if (!foundKanaOrHan) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        }
    }
}
 
Example 6
Source File: UTS46.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Validates one label against the CONTEXTO rule sets quoted in the comments
 * below, reporting every violation on {@code info} through {@code addLabelError}.
 *
 * @param label       text containing the label
 * @param labelStart  index of the first char of the label within {@code label}
 * @param labelLength length of the label in chars
 * @param info        collects CONTEXTO_PUNCTUATION and CONTEXTO_DIGITS errors
 */
private void checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
    int labelEnd = labelStart + labelLength - 1;  // inclusive
    int arabicDigits = 0;  // -1 for 066x, +1 for 06Fx
    for (int i = labelStart; i <= labelEnd; ++i) {
        int ch = label.charAt(i);
        switch (ch) {
        case 0xb7:
            // Appendix A.3. MIDDLE DOT (U+00B7)
            // Rule Set:
            //  False;
            //  If Before(cp) .eq.  U+006C And
            //     After(cp) .eq.  U+006C Then True;
            if (i <= labelStart || label.charAt(i - 1) != 'l'
                    || i >= labelEnd || label.charAt(i + 1) != 'l') {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
            break;
        case 0x375:
            // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
            // Rule Set:
            //  False;
            //  If Script(After(cp)) .eq.  Greek Then True;
            if (i >= labelEnd
                    || UScript.GREEK != UScript.getScript(Character.codePointAt(label, i + 1))) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
            break;
        case 0x5f3:
        case 0x5f4:
            // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
            // Rule Set:
            //  False;
            //  If Script(Before(cp)) .eq.  Hebrew Then True;
            //
            // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
            // Rule Set:
            //  False;
            //  If Script(Before(cp)) .eq.  Hebrew Then True;
            if (labelStart >= i
                    || UScript.HEBREW != UScript.getScript(Character.codePointBefore(label, i))) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
            break;
        case 0x30fb: {
            // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
            // Rule Set:
            //  False;
            //  For All Characters:
            //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
            //  End For;
            // Walk the label code point by code point until a qualifying script shows up.
            boolean kanaOrHanSeen = false;
            for (int j = labelStart; j <= labelEnd && !kanaOrHanSeen; ) {
                int cp = Character.codePointAt(label, j);
                int script = UScript.getScript(cp);
                kanaOrHanSeen = script == UScript.HIRAGANA
                        || script == UScript.KATAKANA
                        || script == UScript.HAN;
                j += Character.charCount(cp);
            }
            if (!kanaOrHanSeen) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
            break;
        }
        default:
            // Everything else is ignored (including the ASCII fastpath below U+00B7),
            // except the two Arabic digit ranges.
            //
            // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
            // Rule Set:
            //  True;
            //  For All Characters:
            //    If cp .in. 06F0..06F9 Then False;
            //  End For;
            //
            // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
            // Rule Set:
            //  True;
            //  For All Characters:
            //    If cp .in. 0660..0669 Then False;
            //  End For;
            if (0x660 <= ch && ch <= 0x669) {
                if (arabicDigits > 0) {
                    addLabelError(info, Error.CONTEXTO_DIGITS);
                }
                arabicDigits = -1;
            } else if (0x6f0 <= ch && ch <= 0x6f9) {
                if (arabicDigits < 0) {
                    addLabelError(info, Error.CONTEXTO_DIGITS);
                }
                arabicDigits = 1;
            }
            break;
        }
    }
}
 
Example 7
Source File: AnyTransliterator.java    From fitnotifications with Apache License 2.0 2 votes vote down vote up
/**
 * Tells whether the given script code is one of the scripts treated as wide here.
 *
 * @param script a script code, compared against {@code UScript} constants
 * @return {@code true} for BOPOMOFO, HAN, HANGUL, HIRAGANA or KATAKANA;
 *         {@code false} for any other value
 */
private boolean isWide(int script) {
    switch (script) {
    case UScript.BOPOMOFO:
    case UScript.HAN:
    case UScript.HANGUL:
    case UScript.HIRAGANA:
    case UScript.KATAKANA:
        return true;
    default:
        return false;
    }
}