com.ibm.icu.text.Normalizer2 Java Examples
The following examples show how to use
com.ibm.icu.text.Normalizer2.
The originating project, source file, and license for each example are noted in the header above it.
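Before the project examples, here is a minimal standalone sketch (not taken from any project below; the inputs are illustrative assumptions) of the two common ways to obtain a normalizer and apply it:

import com.ibm.icu.text.Normalizer2;

public class Normalizer2Basics {
    public static void main(String[] args) {
        // Built-in NFC instance (available since ICU4J 49).
        Normalizer2 nfc = Normalizer2.getNFCInstance();

        // Named instance: a null data argument loads ICU's bundled data,
        // "nfkc_cf" selects NFKC with case folding, COMPOSE selects the composed form.
        Normalizer2 nfkcCf = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);

        String decomposed = "e\u0301";                     // 'e' followed by a combining acute accent
        System.out.println(nfc.isNormalized(decomposed));  // false
        System.out.println(nfc.normalize(decomposed));     // "\u00E9" (precomposed é)
        System.out.println(nfkcCf.normalize("ＡＢＣ"));    // "abc" (width-folded and case-folded)
    }
}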
Example #1
Source File: IcuNormalizerCharFilterFactory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
protected Normalizer2.Mode getNormalizationMode(Settings settings) {
    Normalizer2.Mode normalizationMode;
    switch (settings.get("normalization_mode", "compose")) {
        case "compose_contiguous":
            normalizationMode = Normalizer2.Mode.COMPOSE_CONTIGUOUS;
            break;
        case "decompose":
            normalizationMode = Normalizer2.Mode.DECOMPOSE;
            break;
        case "fcd":
            normalizationMode = Normalizer2.Mode.FCD;
            break;
        default:
            normalizationMode = Normalizer2.Mode.COMPOSE;
            break;
    }
    return normalizationMode;
}
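The mode selected by this method is then combined with a form name and passed to Normalizer2.getInstance; a hedged sketch of that wiring follows (the "nfc" form and FCD mode are illustrative choices here, not necessarily the plugin's defaults):

// Illustrative wiring only (not the plugin's code): turn a configured mode into a normalizer.
Normalizer2.Mode mode = Normalizer2.Mode.FCD;                        // as if "fcd" had been configured
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfc", mode); // FCD normalizer over the nfc data
String result = normalizer.normalize("a\u0301\u0300");               // returns an FCD-normalized string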
Example #2
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testMassiveLigature() throws IOException {
    String input = "\uFDFA";
    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"صلى", "الله", "عليه", "وسلم"},
        new int[]{0, 0, 0, 0},
        new int[]{0, 0, 0, 1},
        input.length());
}
Example #3
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testTokenStream2() throws IOException {
    // '㌰', '<<'゙, '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
    String input = "㌰゙5℃№㈱㌘ザゾ";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), 1, 1);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
        new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
        new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
        input.length());
}
Example #4
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testTokenStream() throws IOException {
    // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
    String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
        new int[] {0, 2, 4, 6, 8, 11, 14},
        new int[] {1, 3, 5, 7, 10, 13, 16},
        input.length());
}
Example #5
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNormalization() throws IOException {
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
        int length = reader.read(tempBuff);
        if (length == -1) {
            break;
        }
        output.append(tempBuff, 0, length);
        assertEquals(output.toString(),
            normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
    }

    assertEquals(expectedOutput, output.toString());
}
Example #6
Source File: TestICUNormalizer2Filter.java From lucene-solr with Apache License 2.0
public void testAlternate() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(
                tokenizer,
                /* specify nfc with decompose to get nfd */
                Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)));
        }
    };

    // decompose EAcute into E + combining Acute
    assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
    a.close();
}
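Outside of the analyzer plumbing, the same "nfc data with DECOMPOSE mode equals NFD" trick can be verified directly against the Normalizer2 API; a minimal sketch (illustrative only):

// é (U+00E9) decomposes to 'e' + combining acute accent under NFD.
Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
System.out.println(nfd.normalize("\u00E9"));                      // "e" + U+0301
System.out.println(nfd.normalize("\u00E9")
    .equals(Normalizer2.getNFDInstance().normalize("\u00E9")));   // true: same result as getNFDInstance()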
Example #7
Source File: ICUNormalizer2FilterFactory.java From lucene-solr with Apache License 2.0
/** Creates a new ICUNormalizer2FilterFactory */
public ICUNormalizer2FilterFactory(Map<String,String> args) {
    super(args);
    String form = get(args, "form", "nfkc_cf");
    String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");
    Normalizer2 normalizer = Normalizer2.getInstance(
        null, form, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);

    String filter = get(args, "filter");
    if (filter != null) {
        UnicodeSet set = new UnicodeSet(filter);
        if (!set.isEmpty()) {
            set.freeze();
            normalizer = new FilteredNormalizer2(normalizer, set);
        }
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
    this.normalizer = normalizer;
}
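For context, FilteredNormalizer2 simply restricts normalization to code points inside the given UnicodeSet; a small standalone sketch (the set used here is an arbitrary assumption, not taken from the factory above):

// Only characters inside the set are normalized; everything else passes through untouched.
Normalizer2 nfkcCf = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
UnicodeSet set = new UnicodeSet("[^åäöÅÄÖ]").freeze();           // leave Swedish letters alone
Normalizer2 filtered = new FilteredNormalizer2(nfkcCf, set);
System.out.println(filtered.normalize("Ａ Å"));                  // fullwidth A folds to "a", Å is preserved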
Example #8
Source File: ICUNormalizer2CharFilterFactory.java From lucene-solr with Apache License 2.0
/** Creates a new ICUNormalizer2CharFilterFactory */
public ICUNormalizer2CharFilterFactory(Map<String,String> args) {
    super(args);
    String form = get(args, "form", "nfkc_cf");
    String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");
    Normalizer2 normalizer = Normalizer2.getInstance(
        null, form, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);

    String filter = get(args, "filter");
    if (filter != null) {
        UnicodeSet set = new UnicodeSet(filter);
        if (!set.isEmpty()) {
            set.freeze();
            normalizer = new FilteredNormalizer2(normalizer, set);
        }
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
    this.normalizer = normalizer;
}
Example #9
Source File: ICUFoldingFilterFactory.java From lucene-solr with Apache License 2.0
/** Creates a new ICUFoldingFilterFactory */
public ICUFoldingFilterFactory(Map<String,String> args) {
    super(args);
    Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
    String filter = get(args, "filter");
    if (filter != null) {
        UnicodeSet set = new UnicodeSet(filter);
        if (!set.isEmpty()) {
            set.freeze();
            normalizer = new FilteredNormalizer2(normalizer, set);
        }
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
    this.normalizer = normalizer;
}
Example #10
Source File: IcuNormalizerTokenFilterFactory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
protected Normalizer2.Mode getNormalizationMode(Settings settings) {
    Normalizer2.Mode normalizationMode;
    switch (settings.get("normalization_mode", "compose")) {
        case "compose_contiguous":
            normalizationMode = Normalizer2.Mode.COMPOSE_CONTIGUOUS;
            break;
        case "decompose":
            normalizationMode = Normalizer2.Mode.DECOMPOSE;
            break;
        case "fcd":
            normalizationMode = Normalizer2.Mode.FCD;
            break;
        default:
            normalizationMode = Normalizer2.Mode.COMPOSE;
            break;
    }
    return normalizationMode;
}
Example #11
Source File: CollationBuilder.java From fitnotifications with Apache License 2.0
public CollationBuilder(CollationTailoring b) {
    nfd = Normalizer2.getNFDInstance();
    fcd = Norm2AllModes.getFCDNormalizer2();
    nfcImpl = Norm2AllModes.getNFCInstance().impl;
    base = b;
    baseData = b.data;
    rootElements = new CollationRootElements(b.data.rootElements);
    variableTop = 0;
    dataBuilder = new CollationDataBuilder();
    fastLatinEnabled = true;
    cesLength = 0;
    rootPrimaryIndexes = new UVector32();
    nodes = new UVector64();
    nfcImpl.ensureCanonIterData();
    dataBuilder.initForTailoring(baseData);
}
Example #12
Source File: StringPrototype.java From es6draft with MIT License
/**
 * 21.1.3.12 String.prototype.normalize ( [ form ] )
 *
 * @param cx
 *            the execution context
 * @param thisValue
 *            the function this-value
 * @param form
 *            the normalisation form
 * @return the normalized string
 */
@Function(name = "normalize", arity = 0)
public static Object normalize(ExecutionContext cx, Object thisValue, Object form) {
    /* step 1 */
    Object obj = RequireObjectCoercible(cx, thisValue);
    /* step 2 */
    String s = ToFlatString(cx, obj);
    /* steps 3-4 */
    String f = !Type.isUndefined(form) ? ToFlatString(cx, form) : "NFC";
    /* step 5 */
    Normalizer2 normalizer;
    switch (f) {
    case "NFC":
        normalizer = Normalizer2.getNFCInstance();
        break;
    case "NFD":
        normalizer = Normalizer2.getNFDInstance();
        break;
    case "NFKC":
        normalizer = Normalizer2.getNFKCInstance();
        break;
    case "NFKD":
        normalizer = Normalizer2.getNFKDInstance();
        break;
    default:
        throw newRangeError(cx, Messages.Key.InvalidNormalizationForm, f);
    }
    /* steps 6-7 */
    return ensureValidString(cx, () -> normalizer.normalize(s));
}
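The four built-in accessors used above differ only in the Unicode normalization form they implement; a quick standalone comparison (the ligature input is just an illustrative choice):

String fi = "\uFB01";                                             // 'ﬁ' LATIN SMALL LIGATURE FI
System.out.println(Normalizer2.getNFCInstance().normalize(fi));  // "ﬁ" (canonical forms keep the ligature)
System.out.println(Normalizer2.getNFDInstance().normalize(fi));  // "ﬁ"
System.out.println(Normalizer2.getNFKCInstance().normalize(fi)); // "fi" (compatibility forms expand it)
System.out.println(Normalizer2.getNFKDInstance().normalize(fi)); // "fi"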
Example #13
Source File: IcuNormalizerCharFilterFactory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment,
                                      String name, Settings settings) {
    super(indexSettings, name);
    Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings),
        getNormalizationName(settings), getNormalizationMode(settings));
    String unicodeSetFilter = settings.get("unicode_set_filter");
    this.normalizer = unicodeSetFilter != null ?
        new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base;
}
Example #14
Source File: IcuNormalizerTokenFilterFactory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment,
                                       String name, Settings settings) {
    super(indexSettings, name, settings);
    Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings),
        getNormalizationName(settings), getNormalizationMode(settings));
    String unicodeSetFilter = settings.get("unicode_set_filter");
    this.normalizer = unicodeSetFilter != null ?
        new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base;
}
Example #15
Source File: SegmentationIcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                new DefaultIcuTokenizerConfig(false, true));
            TokenFilter filter = new IcuNormalizerFilter(tokenizer,
                Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
Example #16
Source File: IcuNormalizerFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testAlternate() throws Exception {
    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new IcuNormalizerFilter(
                tokenizer,
                Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)));
        }
    };
    assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
    a.close();
}
Example #17
Source File: IcuNormalizerFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testEmptyTerm() throws Exception {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new IcuNormalizerFilter(tokenizer,
                Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)));
        }
    };
    checkOneTerm(a, "", "");
    a.close();
}
Example #18
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFCHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}
Example #19
Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0
@Override
int getValue(int c) {
    return Normalizer2.getNFDInstance().getCombiningClass(c);
}
Example #20
Source File: ICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
    super(in);
    this.normalizer = Objects.requireNonNull(normalizer);
    this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
}
Example #21
Source File: IcuNormalizerCharFilter.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private IcuNormalizerCharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
    super(in);
    this.normalizer = Objects.requireNonNull(normalizer);
    this.tmpBuffer = new char[bufferSize];
}
Example #22
Source File: Normalizer2Factory.java From oacc-core with Apache License 2.0
public static Normalizer2 getNFCInstance() {
    return Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
}
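For comparison, ICU4J 49 and later also expose the same normalizer through a dedicated accessor, so the factory method above could be reduced to the one-liner below (shown only as an equivalence note, not part of the oacc-core source):

// Equivalent on ICU4J 49+: the dedicated accessor returns the NFC normalizer directly.
Normalizer2 nfc = Normalizer2.getNFCInstance();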
Example #23
Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0
@Override
int getValue(int c) {
    return Normalizer2.getNFDInstance().getCombiningClass(c);
}
Example #24
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKC_CFHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}
Example #25
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKC_CF() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000, 128);
}
Example #26
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKDHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}
Example #27
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKD() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 20, RANDOM_MULTIPLIER*1000, 128);
}
Example #28
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKCHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}
Example #29
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKC() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000, 128);
}
Example #30
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFDHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}