com.ibm.icu.lang.UCharacter Java Examples
The following examples show how to use
com.ibm.icu.lang.UCharacter.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: UCharacterName.java From trekarta with GNU General Public License v3.0 | 6 votes |
/** * Gets the character extended type * @param ch character to be tested * @return extended type it is associated with */ private static int getType(int ch) { if (UCharacterUtility.isNonCharacter(ch)) { // not a character we return a invalid category count return NON_CHARACTER_; } int result = UCharacter.getType(ch); if (result == UCharacterCategory.SURROGATE) { if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { result = LEAD_SURROGATE_; } else { result = TRAIL_SURROGATE_; } } return result; }
Example #2
Source File: SpoofChecker.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Computes the set of numerics for a string, according to UTS 39 section 5.3. */ private void getNumerics(String input, UnicodeSet result) { result.clear(); for (int utf16Offset = 0; utf16Offset < input.length();) { int codePoint = Character.codePointAt(input, utf16Offset); utf16Offset += Character.charCount(codePoint); // Store a representative character for each kind of decimal digit if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { // Store the zero character as a representative for comparison. // Unicode guarantees it is codePoint - value result.add(codePoint - UCharacter.getNumericValue(codePoint)); } } }
Example #3
Source File: CollationRuleParser.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Gets a script or reorder code from its string representation. * @return the script/reorder code, or * -1 if not recognized */ public static int getReorderCode(String word) { for(int i = 0; i < gSpecialReorderCodes.length; ++i) { if(word.equalsIgnoreCase(gSpecialReorderCodes[i])) { return Collator.ReorderCodes.FIRST + i; } } try { int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, word); if(script >= 0) { return script; } } catch (IllegalIcuArgumentException e) { // fall through } if(word.equalsIgnoreCase("others")) { return Collator.ReorderCodes.OTHERS; // same as Zzzz = USCRIPT_UNKNOWN } return -1; }
Example #4
Source File: RuleBasedNumberFormat.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Adjust capitalization of formatted result for display context */ private String adjustForContext(String result) { if (result != null && result.length() > 0 && UCharacter.isLowerCase(result.codePointAt(0))) { DisplayContext capitalization = getContext(DisplayContext.Type.CAPITALIZATION); if ( capitalization==DisplayContext.CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE || (capitalization == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU && capitalizationForListOrMenu) || (capitalization == DisplayContext.CAPITALIZATION_FOR_STANDALONE && capitalizationForStandAlone) ) { if (capitalizationBrkIter == null) { // should only happen when deserializing, etc. capitalizationBrkIter = BreakIterator.getSentenceInstance(locale); } return UCharacter.toTitleCase(locale, result, capitalizationBrkIter, UCharacter.TITLECASE_NO_LOWERCASE | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT); } } return result; }
Example #5
Source File: Utility.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Parse a Unicode identifier from the given string at the given * position. Return the identifier, or null if there is no * identifier. * @param str the string to parse * @param pos INPUT-OUPUT parameter. On INPUT, pos[0] is the * first character to examine. It must be less than str.length(), * and it must not point to a whitespace character. That is, must * have pos[0] < str.length(). On * OUTPUT, the position after the last parsed character. * @return the Unicode identifier, or null if there is no valid * identifier at pos[0]. */ public static String parseUnicodeIdentifier(String str, int[] pos) { // assert(pos[0] < str.length()); StringBuilder buf = new StringBuilder(); int p = pos[0]; while (p < str.length()) { int ch = Character.codePointAt(str, p); if (buf.length() == 0) { if (UCharacter.isUnicodeIdentifierStart(ch)) { buf.appendCodePoint(ch); } else { return null; } } else { if (UCharacter.isUnicodeIdentifierPart(ch)) { buf.appendCodePoint(ch); } else { break; } } p += UTF16.getCharCount(ch); } pos[0] = p; return buf.toString(); }
Example #6
Source File: BreakIteratorWrapper.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Derives a word-break rule status for the span between two boundaries.
 *
 * <p>The span is scanned code point by code point; the first digit yields
 * {@code WORD_NUMBER}, the first letter yields {@code WORD_LETTER}. If
 * neither is found, or either boundary is {@code BreakIterator.DONE}, the
 * status is {@code WORD_NONE}.
 *
 * @param current boundary offset where the span begins (added to {@code start})
 * @param next    boundary offset where the span ends (added to {@code start})
 * @return one of the {@code RuleBasedBreakIterator.WORD_*} status values
 */
private int calcStatus(int current, int next) {
    if (current == BreakIterator.DONE || next == BreakIterator.DONE) {
        return RuleBasedBreakIterator.WORD_NONE;
    }
    int begin = start + current;
    int end = start + next;
    int codepoint;
    for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
        // Read at the moving index i; reading at begin would re-test the
        // first code point of the span on every iteration.
        codepoint = UTF16.charAt(text, 0, end, i);
        if (UCharacter.isDigit(codepoint)) {
            return RuleBasedBreakIterator.WORD_NUMBER;
        } else if (UCharacter.isLetter(codepoint)) {
            return RuleBasedBreakIterator.WORD_LETTER;
        }
    }
    return RuleBasedBreakIterator.WORD_NONE;
}
Example #7
Source File: UCharacterName.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Retrieve the name of a Unicode code point. * Depending on <code>choice</code>, the character name written into the * buffer is the "modern" name or the name that was defined in Unicode * version 1.0. * The name contains only "invariant" characters * like A-Z, 0-9, space, and '-'. * * @param ch the code point for which to get the name. * @param choice Selector for which name to get. * @return if code point is above 0x1fff, null is returned */ public String getName(int ch, int choice) { if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE || choice > UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT) { return null; } String result = null; result = getAlgName(ch, choice); // getting normal character name if (result == null || result.length() == 0) { if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { result = getExtendedName(ch); } else { result = getGroupName(ch, choice); } } return result; }
Example #8
Source File: Locale.java From tcl-regex-java with Apache License 2.0 | 6 votes |
static int element(String what) throws RegexException { // this is a single character or the name of a character. // Surrogate pairs? we can't deal yet. This function // returns 'int' but no one upstairs is home yet. if (what.length() == 1) { return what.charAt(0); } if (CNAME.containsKey(what)) { return CNAME.get(what); } int uc = UCharacter.getCharFromName(what); // what if someone names a non-BMP char? if (uc != -1) { if (uc > 0xffff) { throw new RegexException(String.format( "Limitation: cannot handle equivalence outside of the BMP: %s not possible.", what)); } return uc; } return -1; }
Example #9
Source File: RBBIRuleScanner.java From fitnotifications with Apache License 2.0 | 6 votes |
/**
 * Returns a copy of the rule source with {@code #} comments and ISO control
 * characters removed.
 *
 * <p>A comment runs from {@code #} up to and including the next CR, LF or
 * {@code chNEL}, or to the end of the input. A comment that reaches the end
 * of the input without a line terminator is dropped completely; none of its
 * characters reach the output.
 *
 * @param rules rule source text
 * @return the stripped rule text
 */
static String stripRules(String rules) {
    StringBuilder strippedRules = new StringBuilder();
    int rulesLength = rules.length();
    for (int idx = 0; idx < rulesLength;) {
        char ch = rules.charAt(idx++);
        if (ch == '#') {
            boolean terminated = false;
            while (idx < rulesLength) {
                ch = rules.charAt(idx++);
                if (ch == '\r' || ch == '\n' || ch == chNEL) {
                    terminated = true;
                    break;
                }
            }
            if (!terminated) {
                // Comment ran to end of input: ch is still comment text
                // (or the '#' itself) and must not be emitted.
                break;
            }
        }
        if (!UCharacter.isISOControl(ch)) {
            strippedRules.append(ch);
        }
    }
    return strippedRules.toString();
}
Example #10
Source File: UCharacterName.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Gets the character extended type * @param ch character to be tested * @return extended type it is associated with */ private static int getType(int ch) { if (UCharacterUtility.isNonCharacter(ch)) { // not a character we return a invalid category count return NON_CHARACTER_; } int result = UCharacter.getType(ch); if (result == UCharacterCategory.SURROGATE) { if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { result = LEAD_SURROGATE_; } else { result = TRAIL_SURROGATE_; } } return result; }
Example #11
Source File: AlphabeticIndex.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Return the string with interspersed CGJs. Input must have more than 2 codepoints. * <p>This is used to test whether contractions sort differently from their components. */ private String separated(String item) { StringBuilder result = new StringBuilder(); // add a CGJ except within surrogates char last = item.charAt(0); result.append(last); for (int i = 1; i < item.length(); ++i) { char ch = item.charAt(i); if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) { result.append(CGJ); } result.append(ch); last = ch; } return result.toString(); }
Example #12
Source File: UnicodeDataTest.java From es6draft with MIT License | 6 votes |
@SuppressWarnings("deprecation") @Test public void testAllICUBinaryProperties() { for (int p = UProperty.BINARY_START; p < UProperty.BINARY_LIMIT; ++p) { String shortName = UCharacter.getPropertyName(p, UProperty.NameChoice.SHORT); if (shortName != null) { // Does not throw. isBinaryProperty(shortName); } String longName = UCharacter.getPropertyName(p, UProperty.NameChoice.LONG); if (longName != null) { // Does not throw. isBinaryProperty(longName); } } }
Example #13
Source File: TransliteratorParser.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Implement SymbolTable API. Parse out a symbol reference * name. */ @Override public String parseReference(String text, ParsePosition pos, int limit) { int start = pos.getIndex(); int i = start; while (i < limit) { char c = text.charAt(i); if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) || !UCharacter.isUnicodeIdentifierPart(c)) { break; } ++i; } if (i == start) { // No valid name chars return null; } pos.setIndex(i); return text.substring(start, i); }
Example #14
Source File: Trie.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Internal trie getter from a code point. * Could be faster(?) but longer with * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); } * Gets the offset to data which the codepoint points to * @param ch codepoint * @return offset to data */ protected final int getCodePointOffset(int ch) { // if ((ch >> 16) == 0) slower if (ch < 0) { return -1; } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works return getRawOffset(0, (char)ch); } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { // BMP codepoint return getBMPOffset((char)ch); } else if (ch <= UCharacter.MAX_VALUE) { // look at the construction of supplementary characters // trail forms the ends of it. return getSurrogateOffset(UTF16.getLeadSurrogate(ch), (char)(ch & SURROGATE_MASK_)); } else { // return -1 if there is an error, in this case we return return -1; } }
Example #15
Source File: UCharacterName.java From trekarta with GNU General Public License v3.0 | 6 votes |
/** * Retrieve the name of a Unicode code point. * Depending on <code>choice</code>, the character name written into the * buffer is the "modern" name or the name that was defined in Unicode * version 1.0. * The name contains only "invariant" characters * like A-Z, 0-9, space, and '-'. * * @param ch the code point for which to get the name. * @param choice Selector for which name to get. * @return if code point is above 0x1fff, null is returned */ public String getName(int ch, int choice) { if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE || choice > UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT) { return null; } String result = null; result = getAlgName(ch, choice); // getting normal character name if (result == null || result.length() == 0) { if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { result = getExtendedName(ch); } else { result = getGroupName(ch, choice); } } return result; }
Example #16
Source File: IntTrieBuilder.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Sets a 32 bit data in the table data * @param ch codepoint which data is to be set * @param value to set * @return true if the set is successful, otherwise * if the table has been compacted return false */ public boolean setValue(int ch, int value) { // valid, uncompacted trie and valid c? if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) { return false; } int block = getDataBlock(ch); if (block < 0) { return false; } m_data_[block + (ch & MASK_)] = value; return true; }
Example #17
Source File: UTR30DataFileGenerator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Expands one rule into its textual form and appends it to {@code builder}.
 *
 * <p>When the right-hand side matches {@code NUMERIC_VALUE_PATTERN}, every
 * code point of the left-hand set gets its own line mapping it to
 * {@code 0x30 + numeric value}, followed by the character name as a
 * comment. Otherwise each range is written once, mapped to the literal
 * right-hand side. Strings inside the set are logged as errors and skipped.
 *
 * @param builder       receives the expanded rule lines
 * @param leftHandSide  UnicodeSet pattern (spaces ignored)
 * @param rightHandSide mapping target text
 */
private static void expandSingleRule(StringBuilder builder, String leftHandSide, String rightHandSide) {
    UnicodeSet set = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
    boolean numericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
    UnicodeSetIterator it = new UnicodeSetIterator(set);
    while (it.nextRange()) {
        if (it.codepoint == UnicodeSetIterator.IS_STRING) {
            logger.error("ERROR: String '" + it.getString() + "' found in UnicodeSet");
            continue;
        }
        if (numericValue) {
            for (int cp = it.codepoint; cp <= it.codepointEnd; ++cp) {
                builder.append(String.format(Locale.ROOT, "%04X", cp))
                        .append('>')
                        .append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp)))
                        .append(" # ")
                        .append(UCharacter.getName(cp))
                        .append("\n");
            }
            continue;
        }
        builder.append(String.format(Locale.ROOT, "%04X", it.codepoint));
        if (it.codepointEnd > it.codepoint) {
            builder.append("..").append(String.format(Locale.ROOT, "%04X", it.codepointEnd));
        }
        builder.append('>').append(rightHandSide).append("\n");
    }
}
Example #18
Source File: IntTrieBuilder.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Get a 32 bit data from the table data * @param ch code point for which data is to be retrieved. * @param inBlockZero Output parameter, inBlockZero[0] returns true if the * char maps into block zero, otherwise false. * @return the 32 bit data value. */ public int getValue(int ch, boolean [] inBlockZero) { // valid, uncompacted trie and valid c? if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) { if (inBlockZero != null) { inBlockZero[0] = true; } return 0; } int block = m_index_[ch >> SHIFT_]; if (inBlockZero != null) { inBlockZero[0] = (block == 0); } return m_data_[Math.abs(block) + (ch & MASK_)]; }
Example #19
Source File: UCharacterName.java From trekarta with GNU General Public License v3.0 | 5 votes |
/**
 * Sets the information for accessing the algorithmic names.
 *
 * <p>Nothing is stored unless all arguments are valid: the range must lie
 * within {@code UCharacter.MIN_VALUE..UCharacter.MAX_VALUE} with
 * {@code rangestart <= rangeend}, and {@code type} must be
 * {@code TYPE_0_} or {@code TYPE_1_}.
 *
 * @param rangestart starting code point that lies within this name group
 * @param rangeend   end code point that lies within this name group
 * @param type       algorithm type; one kind uses the code point as part of
 *                   the name, the other uses variant postfix strings
 * @param variant    algorithmic variant
 * @return {@code true} if the values are valid and were stored
 */
boolean setInfo(int rangestart, int rangeend, byte type, byte variant) {
    boolean validRange = rangestart >= UCharacter.MIN_VALUE
            && rangestart <= rangeend
            && rangeend <= UCharacter.MAX_VALUE;
    boolean validType = (type == TYPE_0_) || (type == TYPE_1_);
    if (!validRange || !validType) {
        return false;
    }
    m_rangestart_ = rangestart;
    m_rangeend_ = rangeend;
    m_type_ = type;
    m_variant_ = variant;
    return true;
}
Example #20
Source File: StringFormatSpecifierImpl.java From birt with Eclipse Public License 1.0 | 5 votes |
/**
 * Applies a case conversion selected by a format option character.
 *
 * @param val    string to be handled
 * @param option {@code '<'} for lower case, {@code '>'} for upper case;
 *               any other value leaves the string untouched
 * @param locale locale used for the case mapping
 * @return the converted string, or {@code val} itself for other options
 */
private String handleCase( String val, char option, ULocale locale )
{
    switch ( option )
    {
        case '<' :
            return UCharacter.toLowerCase( locale, val );
        case '>' :
            return UCharacter.toUpperCase( locale, val );
        default :
            return val;
    }
}
Example #21
Source File: UnicodeDataTest.java From es6draft with MIT License | 5 votes |
@SuppressWarnings("deprecation") @Test public void testLimits() { // integer valued properties for (int p = UProperty.INT_START; p < UProperty.INT_LIMIT; ++p) { int min = UCharacter.getIntPropertyMinValue(p); int max = UCharacter.getIntPropertyMaxValue(p); assertTrue(String.format("min=%d", min), min >= 0); assertTrue(String.format("min=%d, max=%d", min, max), min <= max); assertTrue(String.format("max=%d", max), max < 512); // BINARY_MASK in UEncoding } }
Example #22
Source File: GenerateUTR30DataFiles.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Expands one rule into its textual form and appends it to {@code builder}.
 *
 * <p>If the right-hand side matches {@code NUMERIC_VALUE_PATTERN}, each
 * code point of the left-hand set is written on its own line, mapped to
 * {@code 0x30 + numeric value} with the character name as a trailing
 * comment. Otherwise each range is written once with the literal
 * right-hand side. A string inside the set is reported on standard error
 * and terminates the JVM with exit status 1.
 *
 * @param builder       receives the expanded rule lines
 * @param leftHandSide  UnicodeSet pattern (spaces ignored)
 * @param rightHandSide mapping target text
 * @throws IllegalArgumentException declared by the original signature
 */
private static void expandSingleRule
    (StringBuilder builder, String leftHandSide, String rightHandSide)
    throws IllegalArgumentException {
    UnicodeSet set = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
    boolean numericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
    UnicodeSetIterator it = new UnicodeSetIterator(set);
    while (it.nextRange()) {
        if (it.codepoint == UnicodeSetIterator.IS_STRING) {
            System.err.println("ERROR: String '" + it.getString() + "' found in UnicodeSet");
            System.exit(1);
            continue;
        }
        if (!numericValue) {
            builder.append(String.format(Locale.ROOT, "%04X", it.codepoint));
            if (it.codepointEnd > it.codepoint) {
                builder.append("..").append(String.format(Locale.ROOT, "%04X", it.codepointEnd));
            }
            builder.append('>').append(rightHandSide).append("\n");
            continue;
        }
        for (int cp = it.codepoint; cp <= it.codepointEnd; ++cp) {
            String source = String.format(Locale.ROOT, "%04X", cp);
            String target = String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp));
            builder.append(source).append('>');
            builder.append(target);
            builder.append(" # ").append(UCharacter.getName(cp));
            builder.append("\n");
        }
    }
}
Example #23
Source File: MCRLanguageDetector.java From mycore with GNU General Public License v3.0 | 5 votes |
/**
 * Adds one score per code point of {@code text} to the script that code
 * point belongs to.
 *
 * <p>The text is walked by code point, not by UTF-16 unit, so a
 * supplementary character is counted once for its real script instead of
 * once for the pair and once more for its lone low surrogate.
 *
 * <p>Scoring is best effort: any exception stops the scan and is discarded,
 * leaving the scores collected so far in place.
 *
 * @param text   text to analyse
 * @param scores script code to score counter, updated through {@code increaseScoreFor}
 */
private static void buildScores(String text, Map<Integer, AtomicInteger> scores) {
    try {
        int index = 0;
        while (index < text.length()) {
            int codePoint = Character.codePointAt(text, index);
            increaseScoreFor(scores, UScript.getScript(codePoint));
            index += Character.charCount(codePoint);
        }
    } catch (Exception ignored) {
        // Deliberately best effort: keep the scores gathered before the failure.
    }
}
Example #24
Source File: TrieBuilder.java From fitnotifications with Apache License 2.0 | 5 votes |
/** * Checks if the character belongs to a zero block in the trie * @param ch codepoint which data is to be retrieved * @return true if ch is in the zero block */ public boolean isInZeroBlock(int ch) { // valid, uncompacted trie and valid c? if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < UCharacter.MIN_VALUE) { return true; } return m_index_[ch >> SHIFT_] == 0; }
Example #25
Source File: Utility.java From fitnotifications with Apache License 2.0 | 5 votes |
/** * Parse an unsigned 31-bit integer at the given offset. Use * UCharacter.digit() to parse individual characters into digits. * @param text the text to be parsed * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the * offset within text at which to start parsing; it should point * to a valid digit. On exit, pos[0] is the offset after the last * parsed character. If the parse failed, it will be unchanged on * exit. Must be >= 0 on entry. * @param radix the radix in which to parse; must be >= 2 and <= * 36. * @return a non-negative parsed number, or -1 upon parse failure. * Parse fails if there are no digits, that is, if pos[0] does not * point to a valid digit on entry, or if the number to be parsed * does not fit into a 31-bit unsigned integer. */ public static int parseNumber(String text, int[] pos, int radix) { // assert(pos[0] >= 0); // assert(radix >= 2); // assert(radix <= 36); int n = 0; int p = pos[0]; while (p < text.length()) { int ch = Character.codePointAt(text, p); int d = UCharacter.digit(ch, radix); if (d < 0) { break; } n = radix*n + d; // ASSUME that when a 32-bit integer overflows it becomes // negative. E.g., 214748364 * 10 + 8 => negative value. if (n < 0) { return -1; } ++p; } if (p == pos[0]) { return -1; } pos[0] = p; return n; }
Example #26
Source File: Utility.java From fitnotifications with Apache License 2.0 | 5 votes |
/** * Parse an integer at pos, either of the form \d+ or of the form * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex, * or octal format. * @param pos INPUT-OUTPUT parameter. On input, the first * character to parse. On output, the character after the last * parsed character. */ public static int parseInteger(String rule, int[] pos, int limit) { int count = 0; int value = 0; int p = pos[0]; int radix = 10; if (rule.regionMatches(true, p, "0x", 0, 2)) { p += 2; radix = 16; } else if (p < limit && rule.charAt(p) == '0') { p++; count = 1; radix = 8; } while (p < limit) { int d = UCharacter.digit(rule.charAt(p++), radix); if (d < 0) { --p; break; } ++count; int v = (value * radix) + d; if (v <= value) { // If there are too many input digits, at some point // the value will go negative, e.g., if we have seen // "0x8000000" already and there is another '0', when // we parse the next 0 the value will go negative. return 0; } value = v; } if (count > 0) { pos[0] = p; } return value; }
Example #27
Source File: TrieIterator.java From fitnotifications with Apache License 2.0 | 5 votes |
/**
 * <p>Advances the iteration and reports whether an element was produced.</p>
 * <p>The next set of code points with the same value type is calculated
 * during this call and returned in the argument {@code element}.</p>
 *
 * @param element receives the calculated range
 * @return {@code true} if an element was filled in; {@code false} once the
 *         next code point is past {@code UCharacter.MAX_VALUE}
 * @see com.ibm.icu.util.RangeValueIterator.Element
 */
@Override
public final boolean next(Element element) {
    if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
        return false;
    }
    boolean inBmp = m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE;
    // Try the BMP first; if that does not complete an element, continue
    // into the supplementary range.
    if (!inBmp || !calculateNextBMPElement(element)) {
        calculateNextSupplementaryElement(element);
    }
    return true;
}
Example #28
Source File: XMLRecordReader.java From fitnotifications with Apache License 2.0 | 5 votes |
private String readNextTag() { int c = '\0'; while (!atTag) { c = readChar(); if (c == '<' || c == -1) { if (c == '<') { atTag = true; } break; } if (!UCharacter.isWhitespace(c)) { System.err.println("Unexpected non-whitespace character " + Integer.toHexString(c)); break; } } if (atTag) { atTag = false; StringBuilder sb = new StringBuilder(); while (true) { c = readChar(); if (c == '>' || c == -1) { break; } sb.append((char) c); } // System.err.println("read tag: '" + sb.toString() + "'"); return sb.toString(); } return null; }
Example #29
Source File: IntTrieBuilder.java From fitnotifications with Apache License 2.0 | 5 votes |
/** * Gets a 32 bit data from the table data * @param ch codepoint which data is to be retrieved * @return the 32 bit data */ public int getValue(int ch) { // valid, uncompacted trie and valid c? if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) { return 0; } int block = m_index_[ch >> SHIFT_]; return m_data_[Math.abs(block) + (ch & MASK_)]; }
Example #30
Source File: StringPrep.java From trekarta with GNU General Public License v3.0 | 5 votes |
private StringPrep(ByteBuffer bytes) throws IOException { StringPrepDataReader reader = new StringPrepDataReader(bytes); // read the indexes indexes = reader.readIndexes(INDEX_TOP); sprepTrie = new CharTrie(bytes, null); //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes // load the rest of the data data and initialize the data members mappingData = reader.read(indexes[INDEX_MAPPING_DATA_SIZE]/2); // get the options doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0); checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0); sprepUniVer = getVersionInfo(reader.getUnicodeVersion()); normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]); VersionInfo normUniVer = UCharacter.getUnicodeVersion(); if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */ normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */ ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/ ){ throw new IOException("Normalization Correction version not supported"); } if(checkBiDi) { bdp=UBiDiProps.INSTANCE; } }