Java Code Examples for com.ibm.icu.lang.UScript#getScript()

The following examples show how to use com.ibm.icu.lang.UScript#getScript(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: ScriptIterator.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Script lookup with a fast path: code points that index into {@code basicLatin} are answered
 * from that table, everything else is delegated to {@link UScript#getScript(int)}.
 *
 * <p>When {@code combineCJ} is set, HAN, HIRAGANA and KATAKANA are all reported as
 * {@link UScript#JAPANESE}, and the full width digits U+FF10..U+FF19 are reported as
 * {@link UScript#LATIN}.
 *
 * @param codepoint the code point to classify
 * @return the (possibly remapped) script code
 */
private int getScript(int codepoint) {
  // Table lookup for the range covered by basicLatin.
  if (codepoint >= 0 && codepoint < basicLatin.length) {
    return basicLatin[codepoint];
  }

  final int icuScript = UScript.getScript(codepoint);
  if (!combineCJ) {
    return icuScript;
  }

  final boolean chineseOrJapanese =
      icuScript == UScript.HAN
          || icuScript == UScript.HIRAGANA
          || icuScript == UScript.KATAKANA;
  if (chineseOrJapanese) {
    return UScript.JAPANESE;
  }

  // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
  // they are treated as punctuation. we currently have no cleaner way to fix this!
  final boolean fullWidthDigit = 0xFF10 <= codepoint && codepoint <= 0xFF19;
  return fullWidthDigit ? UScript.LATIN : icuScript;
}
 
Example 2
Source File: ScriptIterator.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Resolves the script of a code point. Code points that fall inside the {@code basicLatin}
 * table are served by an array lookup; all others go through {@link UScript#getScript(int)}.
 *
 * <p>With {@code combineCJ} enabled, HAN / HIRAGANA / KATAKANA collapse into
 * {@link UScript#JAPANESE} and the full width digits U+FF10..U+FF19 are mapped to
 * {@link UScript#LATIN}.
 *
 * @param codepoint the code point to classify
 * @return the (possibly remapped) script code
 */
private int getScript(int codepoint) {
    final boolean inLookupTable = codepoint >= 0 && codepoint < basicLatin.length;
    if (inLookupTable) {
        return basicLatin[codepoint];
    }

    final int resolved = UScript.getScript(codepoint);
    if (!combineCJ) {
        return resolved;
    }

    if (resolved == UScript.HAN || resolved == UScript.HIRAGANA || resolved == UScript.KATAKANA) {
        return UScript.JAPANESE;
    }
    if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
        // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
        // they are treated as punctuation. we currently have no cleaner way to fix this!
        return UScript.LATIN;
    }
    return resolved;
}
 
Example 3
Source File: MCRLanguageDetector.java    From mycore with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Looks up the Unicode script of every code point in {@code text} and reports each one to
 * {@code increaseScoreFor(scores, script)}.
 *
 * <p>The text is walked code point by code point. Advancing one {@code char} at a time would
 * visit the low surrogate of every supplementary character a second time and score that lone
 * surrogate as an extra, bogus "character".
 *
 * <p>This is a best-effort operation: any exception (including a {@code null} {@code text})
 * stops the scan silently, leaving whatever was already recorded in {@code scores}.
 *
 * @param text   the text whose code points are classified
 * @param scores the tally passed through to {@code increaseScoreFor}
 */
private static void buildScores(String text, Map<Integer, AtomicInteger> scores) {
    try {
        for (int i = 0; i < text.length(); ) {
            int codePoint = text.codePointAt(i);
            increaseScoreFor(scores, UScript.getScript(codePoint));
            // Skip both halves of a surrogate pair so it is counted exactly once.
            i += Character.charCount(codePoint);
        }
    } catch (Exception ignored) {
        // Deliberately swallowed: scoring is best-effort and must not fail the caller.
    }
}
 
Example 4
Source File: AnyTransliterator.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Advances to the next script run.
 *
 * <p>On entry the new run begins where the previous one ended ({@code start = limit}). The
 * start is then pulled back over directly preceding COMMON / INHERITED characters (never
 * before {@code textStart}), and the limit is pushed forward over COMMON / INHERITED
 * characters and characters of the first "real" script encountered.
 *
 * <p>Upon return, the caller should examine {@code scriptCode}, {@code start}, and
 * {@code limit}. If the run consists only of COMMON / INHERITED characters,
 * {@code scriptCode} stays {@link UScript#INVALID_CODE}.
 *
 * @return {@code false} if the previous run already ended at {@code textLimit};
 *         {@code true} otherwise
 */
public boolean next() {
    scriptCode = UScript.INVALID_CODE; // script of this run not determined yet
    start = limit;

    // Nothing left to scan.
    if (start == textLimit) {
        return false;
    }

    // Extend the run backwards across adjacent COMMON or INHERITED characters.
    for (; start > textStart; --start) {
        int before = UScript.getScript(text.char32At(start - 1));
        if (before != UScript.COMMON && before != UScript.INHERITED) {
            break;
        }
    }

    // Extend the run forwards across COMMON, INHERITED, and characters that share
    // the run's script.
    for (; limit < textLimit; ++limit) {
        int ahead = UScript.getScript(text.char32At(limit));
        if (ahead == UScript.COMMON || ahead == UScript.INHERITED) {
            continue;
        }
        if (scriptCode == UScript.INVALID_CODE) {
            // First specific script seen: it defines this run.
            scriptCode = ahead;
        } else if (ahead != scriptCode) {
            // A different script starts here; the run ends before it.
            break;
        }
    }

    // A run made up solely of COMMON / INHERITED characters is still reported;
    // scriptCode is UScript.INVALID_CODE in that case.
    return true;
}
 
Example 5
Source File: UTS46.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Checks the CONTEXTO rules for one label and reports every violation through
 * {@code addLabelError(info, ...)}.
 *
 * <p>The label occupies {@code labelLength} UTF-16 code units of {@code label}, beginning at
 * {@code labelStart}. Characters below U+00B7 are skipped; the characters with a rule are
 * U+00B7, U+0375, U+05F3, U+05F4, the digit ranges U+0660..U+0669 and U+06F0..U+06F9, and
 * U+30FB. Punctuation rule failures add {@code Error.CONTEXTO_PUNCTUATION}; mixing the two
 * digit ranges adds {@code Error.CONTEXTO_DIGITS}.
 *
 * <p>NOTE(review): the "Appendix A.x" citations below presumably refer to RFC 5892 — confirm.
 *
 * @param label       text that contains the label
 * @param labelStart  index of the label's first code unit in {@code label}
 * @param labelLength number of code units in the label
 * @param info        passed through to {@code addLabelError} when a rule is violated
 */
private void
checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
    int labelEnd=labelStart+labelLength-1;  // inclusive
    int arabicDigits=0;  // -1 for 066x, +1 for 06Fx
    for(int i=labelStart; i<=labelEnd; ++i) {
        // A single UTF-16 code unit is enough here: every value tested below is in the BMP.
        int c=label.charAt(i);
        if(c<0xb7) {
            // ASCII fastpath
        } else if(c<=0x6f9) {
            if(c==0xb7) {
                // Appendix A.3. MIDDLE DOT (U+00B7)
                // Rule Set:
                //  False;
                //  If Before(cp) .eq.  U+006C And
                //     After(cp) .eq.  U+006C Then True;
                if(!(labelStart<i && label.charAt(i-1)=='l' &&
                     i<labelEnd && label.charAt(i+1)=='l')) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(c==0x375) {
                // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
                // Rule Set:
                //  False;
                //  If Script(After(cp)) .eq.  Greek Then True;
                if(!(i<labelEnd &&
                     UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(c==0x5f3 || c==0x5f4) {
                // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
                // Rule Set:
                //  False;
                //  If Script(Before(cp)) .eq.  Hebrew Then True;
                //
                // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
                // Rule Set:
                //  False;
                //  If Script(Before(cp)) .eq.  Hebrew Then True;
                if(!(labelStart<i &&
                     UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(0x660<=c /* && c<=0x6f9 */) {
                // The upper bound is already guaranteed by the enclosing c<=0x6f9 branch.
                //
                // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
                // Rule Set:
                //  True;
                //  For All Characters:
                //    If cp .in. 06F0..06F9 Then False;
                //  End For;
                //
                // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
                // Rule Set:
                //  True;
                //  For All Characters:
                //    If cp .in. 0660..0669 Then False;
                //  End For;
                //
                // arabicDigits remembers which of the two ranges was seen most recently;
                // an error is added when a digit follows one from the other range.
                // Values in 0x66a..0x6ef match neither branch and are ignored.
                if(c<=0x669) {
                    if(arabicDigits>0) {
                        addLabelError(info, Error.CONTEXTO_DIGITS);
                    }
                    arabicDigits=-1;
                } else if(0x6f0<=c) {
                    if(arabicDigits<0) {
                        addLabelError(info, Error.CONTEXTO_DIGITS);
                    }
                    arabicDigits=1;
                }
            }
        } else if(c==0x30fb) {
            // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
            // Rule Set:
            //  False;
            //  For All Characters:
            //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
            //  End For;
            //
            // Scans the whole label by code point. c is reused as the scan variable: the
            // loop update reads the value assigned in the body, and the outer loop reloads
            // c from label.charAt(i) on its next iteration, so overwriting it is harmless.
            for(int j=labelStart;; j+=Character.charCount(c)) {
                if(j>labelEnd) {
                    // Reached the end without finding a Hiragana, Katakana or Han character.
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    break;
                }
                c=Character.codePointAt(label, j);
                int script=UScript.getScript(c);
                if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
                    break;
                }
            }
        }
    }
}
 
Example 6
Source File: UCharacterProperty.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the script code of {@code c}, as reported by {@link UScript#getScript(int)}.
 *
 * @param c the code point to look up
 * @return the script code for {@code c}
 */
@Override
int getValue(int c) {
    final int scriptCode = UScript.getScript(c);
    return scriptCode;
}
 
Example 7
Source File: CharScriptsSet.java    From jasperreports with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Decides whether a code point is covered by this set of scripts.
 *
 * <p>The decision is made in this order:
 * <ol>
 * <li>no include list and no exclude list: included;</li>
 * <li>script UNKNOWN: included;</li>
 * <li>script COMMON / INHERITED: included unless {@code excludedCommon} /
 * {@code excludedInherited} is set;</li>
 * <li>primary script listed in {@code includedScripts}: included;</li>
 * <li>primary script listed in {@code excludedScripts}: excluded;</li>
 * <li>no include list: included;</li>
 * <li>otherwise included only if {@link UScript#hasScript(int, int)} is true for one of
 * the included scripts.</li>
 * </ol>
 *
 * @param codePoint the code point to test
 * @return {@code true} if the code point is included
 */
public boolean includesCharacter(int codePoint)
{
	final boolean hasIncludeList = includedScripts != null;
	final boolean hasExcludeList = excludedScripts != null;
	if (!hasIncludeList && !hasExcludeList)
	{
		//no restrictions configured
		return true;
	}
	
	final int primaryScript = UScript.getScript(codePoint);
	if (primaryScript == UScript.UNKNOWN)
	{
		//unknown scripts are included by default
		return true;
	}
	if (primaryScript == UScript.COMMON)
	{
		//COMMON stays included unless it was explicitly excluded
		return !excludedCommon;
	}
	if (primaryScript == UScript.INHERITED)
	{
		//INHERITED stays included unless it was explicitly excluded
		return !excludedInherited;
	}
	
	if (hasIncludeList && includedScripts.contains(primaryScript))
	{
		//explicit include wins
		return true;
	}
	if (hasExcludeList && excludedScripts.contains(primaryScript))
	{
		//explicit exclude
		return false;
	}
	if (!hasIncludeList)
	{
		//only an exclude list exists and it did not match
		return true;
	}
	
	//last chance: one of the included scripts may apply as a secondary/extension script
	for (Integer candidate : includedScripts)
	{
		if (UScript.hasScript(codePoint, candidate))
		{
			return true;
		}
	}
	return false;
}
 
Example 8
Source File: UTS46.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Checks the CONTEXTO rules for one label and reports every violation through
 * {@code addLabelError(info, ...)}.
 *
 * <p>The label occupies {@code labelLength} UTF-16 code units of {@code label}, beginning at
 * {@code labelStart}. Characters below U+00B7 are skipped; the characters with a rule are
 * U+00B7, U+0375, U+05F3, U+05F4, the digit ranges U+0660..U+0669 and U+06F0..U+06F9, and
 * U+30FB. Punctuation rule failures add {@code Error.CONTEXTO_PUNCTUATION}; mixing the two
 * digit ranges adds {@code Error.CONTEXTO_DIGITS}.
 *
 * <p>NOTE(review): the "Appendix A.x" citations below presumably refer to RFC 5892 — confirm.
 *
 * @param label       text that contains the label
 * @param labelStart  index of the label's first code unit in {@code label}
 * @param labelLength number of code units in the label
 * @param info        passed through to {@code addLabelError} when a rule is violated
 */
private void
checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
    int labelEnd=labelStart+labelLength-1;  // inclusive
    int arabicDigits=0;  // -1 for 066x, +1 for 06Fx
    for(int i=labelStart; i<=labelEnd; ++i) {
        // A single UTF-16 code unit is enough here: every value tested below is in the BMP.
        int c=label.charAt(i);
        if(c<0xb7) {
            // ASCII fastpath
        } else if(c<=0x6f9) {
            if(c==0xb7) {
                // Appendix A.3. MIDDLE DOT (U+00B7)
                // Rule Set:
                //  False;
                //  If Before(cp) .eq.  U+006C And
                //     After(cp) .eq.  U+006C Then True;
                if(!(labelStart<i && label.charAt(i-1)=='l' &&
                     i<labelEnd && label.charAt(i+1)=='l')) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(c==0x375) {
                // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
                // Rule Set:
                //  False;
                //  If Script(After(cp)) .eq.  Greek Then True;
                if(!(i<labelEnd &&
                     UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(c==0x5f3 || c==0x5f4) {
                // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
                // Rule Set:
                //  False;
                //  If Script(Before(cp)) .eq.  Hebrew Then True;
                //
                // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
                // Rule Set:
                //  False;
                //  If Script(Before(cp)) .eq.  Hebrew Then True;
                if(!(labelStart<i &&
                     UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(0x660<=c /* && c<=0x6f9 */) {
                // The upper bound is already guaranteed by the enclosing c<=0x6f9 branch.
                //
                // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
                // Rule Set:
                //  True;
                //  For All Characters:
                //    If cp .in. 06F0..06F9 Then False;
                //  End For;
                //
                // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
                // Rule Set:
                //  True;
                //  For All Characters:
                //    If cp .in. 0660..0669 Then False;
                //  End For;
                //
                // arabicDigits remembers which of the two ranges was seen most recently;
                // an error is added when a digit follows one from the other range.
                // Values in 0x66a..0x6ef match neither branch and are ignored.
                if(c<=0x669) {
                    if(arabicDigits>0) {
                        addLabelError(info, Error.CONTEXTO_DIGITS);
                    }
                    arabicDigits=-1;
                } else if(0x6f0<=c) {
                    if(arabicDigits<0) {
                        addLabelError(info, Error.CONTEXTO_DIGITS);
                    }
                    arabicDigits=1;
                }
            }
        } else if(c==0x30fb) {
            // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
            // Rule Set:
            //  False;
            //  For All Characters:
            //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
            //  End For;
            //
            // Scans the whole label by code point. c is reused as the scan variable: the
            // loop update reads the value assigned in the body, and the outer loop reloads
            // c from label.charAt(i) on its next iteration, so overwriting it is harmless.
            for(int j=labelStart;; j+=Character.charCount(c)) {
                if(j>labelEnd) {
                    // Reached the end without finding a Hiragana, Katakana or Han character.
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    break;
                }
                c=Character.codePointAt(label, j);
                int script=UScript.getScript(c);
                if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
                    break;
                }
            }
        }
    }
}
 
Example 9
Source File: UCharacterProperty.java    From trekarta with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Returns the script code of {@code c}, as reported by {@link UScript#getScript(int)}.
 *
 * @param c the code point to look up
 * @return the script code for {@code c}
 */
@Override
int getValue(int c) {
    final int scriptCode = UScript.getScript(c);
    return scriptCode;
}