Java Code Examples for com.ibm.icu.text.Normalizer2

The following examples show how to use com.ibm.icu.text.Normalizer2. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: fitnotifications   Source File: CollationBuilder.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a collation builder on top of the given base tailoring.
 *
 * <p>Fetches the NFD, FCD and NFC normalization helpers, stores the base tailoring together
 * with its {@code data}, sets up the initial builder state, and then calls
 * {@code ensureCanonIterData()} on the NFC implementation and
 * {@code initForTailoring(baseData)} on the freshly created data builder.
 *
 * @param b the base tailoring; {@code b.data} and {@code b.data.rootElements} are read here
 */
public CollationBuilder(CollationTailoring b) {
    // Normalization helpers.
    this.nfd = Normalizer2.getNFDInstance();
    this.fcd = Norm2AllModes.getFCDNormalizer2();
    this.nfcImpl = Norm2AllModes.getNFCInstance().impl;

    // Base tailoring and the data hanging off it.
    this.base = b;
    this.baseData = b.data;
    this.rootElements = new CollationRootElements(b.data.rootElements);

    // Initial builder state.
    this.variableTop = 0;
    this.dataBuilder = new CollationDataBuilder();
    this.fastLatinEnabled = true;
    this.cesLength = 0;
    this.rootPrimaryIndexes = new UVector32();
    this.nodes = new UVector64();

    // These two calls rely on the fields assigned above, so they stay last.
    this.nfcImpl.ensureCanonIterData();
    this.dataBuilder.initForTailoring(this.baseData);
}
 
Example 2
Source Project: lucene-solr   Source File: ICUFoldingFilterFactory.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a new ICUFoldingFilterFactory.
 *
 * <p>The factory starts from {@code ICUFoldingFilter.NORMALIZER}. When a {@code "filter"}
 * argument is present and the {@code UnicodeSet} built from it is not empty, the set is frozen
 * and the normalizer is wrapped in a {@code FilteredNormalizer2} using that set.
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if {@code args} is not empty after the {@code "filter"}
 *         lookup (presumably the lookup consumes the key -- confirm against the superclass)
 */
public ICUFoldingFilterFactory(Map<String,String> args) {
  super(args);

  Normalizer2 selected = ICUFoldingFilter.NORMALIZER;
  final String filterPattern = get(args, "filter");
  final UnicodeSet restriction = (filterPattern == null) ? null : new UnicodeSet(filterPattern);
  if (restriction != null && !restriction.isEmpty()) {
    // Only a non-empty set narrows the normalizer; an empty one is ignored.
    restriction.freeze();
    selected = new FilteredNormalizer2(selected, restriction);
  }

  final boolean hasLeftovers = !args.isEmpty();
  if (hasLeftovers) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
  this.normalizer = selected;
}
 
Example 3
/**
 * Creates a new ICUNormalizer2CharFilterFactory.
 *
 * <p>Reads {@code "form"} (fallback argument {@code "nfkc_cf"}) and {@code "mode"} (one of
 * {@code "compose"} / {@code "decompose"}, fallback argument {@code "compose"}) and obtains the
 * matching {@code Normalizer2}. When a {@code "filter"} argument is present and the
 * {@code UnicodeSet} built from it is not empty, the set is frozen and the normalizer is wrapped
 * in a {@code FilteredNormalizer2}.
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if {@code args} is not empty after the lookups above
 */
public ICUNormalizer2CharFilterFactory(Map<String,String> args) {
  super(args);
  final String form = get(args, "form", "nfkc_cf");
  final String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");

  // Anything other than "compose" is treated as decompose.
  final Normalizer2.Mode normalizerMode;
  if ("compose".equals(mode)) {
    normalizerMode = Normalizer2.Mode.COMPOSE;
  } else {
    normalizerMode = Normalizer2.Mode.DECOMPOSE;
  }
  Normalizer2 selected = Normalizer2.getInstance(null, form, normalizerMode);

  final String filterPattern = get(args, "filter");
  final UnicodeSet restriction = (filterPattern == null) ? null : new UnicodeSet(filterPattern);
  if (restriction != null && !restriction.isEmpty()) {
    restriction.freeze();
    selected = new FilteredNormalizer2(selected, restriction);
  }

  final boolean hasLeftovers = !args.isEmpty();
  if (hasLeftovers) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
  this.normalizer = selected;
}
 
Example 4
Source Project: lucene-solr   Source File: ICUNormalizer2FilterFactory.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a new ICUNormalizer2FilterFactory.
 *
 * <p>The normalizer is looked up from the {@code "form"} argument (fallback argument
 * {@code "nfkc_cf"}) and the {@code "mode"} argument ({@code "compose"} or
 * {@code "decompose"}, fallback argument {@code "compose"}). A present {@code "filter"}
 * argument whose {@code UnicodeSet} is not empty causes the normalizer to be wrapped in a
 * {@code FilteredNormalizer2} over the frozen set.
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if {@code args} is not empty after the lookups above
 */
public ICUNormalizer2FilterFactory(Map<String,String> args) {
  super(args);
  final String form = get(args, "form", "nfkc_cf");
  final String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");

  // Anything other than "compose" is treated as decompose.
  Normalizer2.Mode requestedMode = Normalizer2.Mode.DECOMPOSE;
  if ("compose".equals(mode)) {
    requestedMode = Normalizer2.Mode.COMPOSE;
  }
  Normalizer2 result = Normalizer2.getInstance(null, form, requestedMode);

  final String filterPattern = get(args, "filter");
  if (filterPattern != null) {
    final UnicodeSet accepted = new UnicodeSet(filterPattern);
    final boolean hasMembers = !accepted.isEmpty();
    if (hasMembers) {
      accepted.freeze();
      result = new FilteredNormalizer2(result, accepted);
    }
  }

  if (args.isEmpty()) {
    this.normalizer = result;
    return;
  }
  throw new IllegalArgumentException("Unknown parameters: " + args);
}
 
Example 5
Source Project: lucene-solr   Source File: TestICUNormalizer2Filter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks that the "nfc" normalizer requested in DECOMPOSE mode turns U+00E9 into
 * U+0065 followed by U+0301.
 */
public void testAlternate() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      final Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Asking for the "nfc" data in DECOMPOSE mode gives NFD.
      final Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
      return new TokenStreamComponents(source, new ICUNormalizer2Filter(source, nfd));
    }
  };

  // E-acute must come out as E plus combining acute.
  final String[] expected = { "\u0065\u0301" };
  assertAnalyzesTo(analyzer, "\u00E9", expected);
  analyzer.close();
}
 
Example 6
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads a mixed-script input through an ICUNormalizer2CharFilter in chunks of up to 10 chars.
 * After every chunk, the text read so far is compared with the normalization of the input
 * prefix that {@code correctOffset} maps it to; at the end the whole output is compared with
 * normalizing the input in one go.
 */
public void testNormalization() throws IOException {
  final String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
  final Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  final String expectedOutput = normalizer.normalize(input);

  final CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
  final char[] chunk = new char[10];
  final StringBuilder collected = new StringBuilder();
  // read() returning -1 ends the loop.
  for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
    collected.append(chunk, 0, count);
    final int inputOffset = reader.correctOffset(collected.length());
    final String normalizedPrefix = normalizer.normalize(input.substring(0, inputOffset));
    assertEquals(collected.toString(), normalizedPrefix);
  }

  assertEquals(expectedOutput, collected.toString());
}
 
Example 7
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Runs a whitespace tokenizer over "nfkc"-normalized input and checks the produced terms,
 * their start and end offsets, and the final offset ({@code input.length()}).
 */
public void testTokenStream() throws IOException {
  // Seven whitespace-separated items; the expectations below list one term per item.
  String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

  Normalizer2 nfkc = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
  CharFilter normalized = new ICUNormalizer2CharFilter(new StringReader(input), nfkc);

  Tokenizer tokens = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokens.setReader(normalized);

  String[] expectedTerms = {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"};
  int[] expectedStarts = {0, 2, 4, 6, 8, 11, 14};
  int[] expectedEnds = {1, 3, 5, 7, 10, 13, 16};
  assertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds, input.length());
}
 
Example 8
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Runs a 1-gram tokenizer over "nfkc_cf"-normalized input and checks each single-character
 * term, its start and end offsets, and the final offset ({@code input.length()}).
 */
public void testTokenStream2() throws IOException {
  // No separators in the input: every normalized character becomes its own 1-gram.
  String input = "㌰゙5℃№㈱㌘ザゾ";

  Normalizer2 nfkcCasefold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  CharFilter normalized = new ICUNormalizer2CharFilter(new StringReader(input), nfkcCasefold);

  Tokenizer unigrams = new NGramTokenizer(newAttributeFactory(), 1, 1);
  unigrams.setReader(normalized);

  String[] expectedTerms = {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"};
  int[] expectedStarts = {0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9};
  int[] expectedEnds = {1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11};
  assertTokenStreamContents(unigrams, expectedTerms, expectedStarts, expectedEnds, input.length());
}
 
Example 9
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Feeds the single character U+FDFA through "nfkc_cf" normalization and a whitespace tokenizer,
 * expecting four terms whose offsets all fall within that one input character.
 */
public void testMassiveLigature() throws IOException {
  String input = "\uFDFA";

  Normalizer2 nfkcCasefold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  CharFilter normalized = new ICUNormalizer2CharFilter(new StringReader(input), nfkcCasefold);

  Tokenizer tokens = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokens.setReader(normalized);

  String[] expectedTerms = {"صلى", "الله", "عليه", "وسلم"};
  // Only the last term's end offset advances to 1.
  int[] expectedStarts = {0, 0, 0, 0};
  int[] expectedEnds = {0, 0, 0, 1};
  assertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds, input.length());
}
 
Example 10
/**
 * Maps the {@code "normalization_mode"} setting onto a {@code Normalizer2.Mode}.
 *
 * <p>{@code "compose_contiguous"}, {@code "decompose"} and {@code "fcd"} select the matching
 * mode; every other value yields {@code COMPOSE}. The setting is read with {@code "compose"}
 * as the second argument to {@code settings.get} (presumably the default -- confirm against
 * {@code Settings}).
 *
 * @param settings the settings to read {@code "normalization_mode"} from
 * @return the selected normalization mode
 */
protected Normalizer2.Mode getNormalizationMode(Settings settings) {
    final String requested = settings.get("normalization_mode", "compose");
    switch (requested) {
        case "compose_contiguous":
            return Normalizer2.Mode.COMPOSE_CONTIGUOUS;
        case "decompose":
            return Normalizer2.Mode.DECOMPOSE;
        case "fcd":
            return Normalizer2.Mode.FCD;
        default:
            // Covers "compose" as well as any unrecognized value.
            return Normalizer2.Mode.COMPOSE;
    }
}
 
Example 11
/**
 * Picks the {@code Normalizer2.Mode} named by the {@code "normalization_mode"} setting.
 *
 * <p>The result is {@code COMPOSE} unless the setting is {@code "compose_contiguous"},
 * {@code "decompose"} or {@code "fcd"}, which select their respective modes. The setting is
 * read with {@code "compose"} as the second argument to {@code settings.get} (presumably the
 * default -- confirm against {@code Settings}).
 *
 * @param settings the settings to read {@code "normalization_mode"} from
 * @return the selected normalization mode
 */
protected Normalizer2.Mode getNormalizationMode(Settings settings) {
    // Start from COMPOSE and override it only for the three recognized alternatives.
    Normalizer2.Mode selected = Normalizer2.Mode.COMPOSE;
    switch (settings.get("normalization_mode", "compose")) {
        case "compose_contiguous":
            selected = Normalizer2.Mode.COMPOSE_CONTIGUOUS;
            break;
        case "decompose":
            selected = Normalizer2.Mode.DECOMPOSE;
            break;
        case "fcd":
            selected = Normalizer2.Mode.FCD;
            break;
        default:
            break;
    }
    return selected;
}
 
Example 12
/**
 * Creates the char filter factory and resolves the normalizer it will use.
 *
 * <p>The base normalizer comes from {@code getNormalizationResource},
 * {@code getNormalizationName} and {@code getNormalizationMode} applied to {@code settings}.
 * When {@code "unicode_set_filter"} is set, the base normalizer is wrapped in a
 * {@code FilteredNormalizer2} over the frozen {@code UnicodeSet} built from that value.
 *
 * @param indexSettings passed to the superclass constructor
 * @param environment   not used in this constructor
 * @param name          passed to the superclass constructor
 * @param settings      source of the normalization settings and the optional set filter
 */
public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment, String name,
                                      Settings settings) {
    super(indexSettings, name);
    final Normalizer2 unfiltered = Normalizer2.getInstance(
            getNormalizationResource(settings),
            getNormalizationName(settings),
            getNormalizationMode(settings));
    final String setPattern = settings.get("unicode_set_filter");
    if (setPattern == null) {
        this.normalizer = unfiltered;
    } else {
        final UnicodeSet accepted = new UnicodeSet(setPattern);
        this.normalizer = new FilteredNormalizer2(unfiltered, accepted.freeze());
    }
}
 
Example 13
/**
 * Creates the token filter factory and resolves the normalizer it will use.
 *
 * <p>The base normalizer comes from {@code getNormalizationResource},
 * {@code getNormalizationName} and {@code getNormalizationMode} applied to {@code settings}.
 * When {@code "unicode_set_filter"} is set, the base normalizer is wrapped in a
 * {@code FilteredNormalizer2} over the frozen {@code UnicodeSet} built from that value.
 *
 * @param indexSettings passed to the superclass constructor
 * @param environment   not used in this constructor
 * @param name          passed to the superclass constructor
 * @param settings      passed to the superclass constructor; also the source of the
 *                      normalization settings and the optional set filter
 */
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name,
                                       Settings settings) {
    super(indexSettings, name, settings);

    final Normalizer2 unfiltered = Normalizer2.getInstance(
            getNormalizationResource(settings),
            getNormalizationName(settings),
            getNormalizationMode(settings));

    Normalizer2 effective = unfiltered;
    final String setPattern = settings.get("unicode_set_filter");
    if (setPattern != null) {
        effective = new FilteredNormalizer2(unfiltered, new UnicodeSet(setPattern).freeze());
    }
    this.normalizer = effective;
}
 
Example 14
/**
 * Builds an analyzer whose components are an {@code IcuTokenizer} (default attribute factory,
 * {@code DefaultIcuTokenizerConfig(false, true)}) followed by an {@code IcuNormalizerFilter}
 * using the "nfkc_cf" normalizer in COMPOSE mode.
 *
 * @return the newly created analyzer
 */
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            final Normalizer2 nfkcCasefold =
                    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
            final TokenFilter normalizing = new IcuNormalizerFilter(source, nfkcCasefold);
            return new TokenStreamComponents(source, normalizing);
        }
    };
}
 
Example 15
/**
 * Checks that the "nfc" normalizer requested in DECOMPOSE mode turns U+00E9 into
 * U+0065 followed by U+0301.
 */
public void testAlternate() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            final Normalizer2 decomposing =
                    Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
            return new TokenStreamComponents(source, new IcuNormalizerFilter(source, decomposing));
        }
    };
    // One input term, one expected decomposed term.
    final String[] expected = { "\u0065\u0301" };
    assertAnalyzesTo(analyzer, "\u00E9", expected);
    analyzer.close();
}
 
Example 16
/**
 * Checks that an empty input analyzed with a {@code KeywordTokenizer} followed by an
 * "nfkc_cf" COMPOSE {@code IcuNormalizerFilter} yields the single term {@code ""}.
 */
public void testEmptyTerm() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new KeywordTokenizer();
            final Normalizer2 nfkcCasefold =
                    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
            return new TokenStreamComponents(source, new IcuNormalizerFilter(source, nfkcCasefold));
        }
    };
    checkOneTerm(analyzer, "", "");
    analyzer.close();
}
 
Example 17
Source Project: es6draft   Source File: StringPrototype.java    License: MIT License 5 votes vote down vote up
/**
 * 21.1.3.12 String.prototype.normalize ( [ form ] )
 * <p>
 * Converts the this-value to a string and normalizes it with the ICU normalizer matching the
 * requested form. The form defaults to {@code "NFC"} when {@code form} is undefined; any form
 * other than "NFC", "NFD", "NFKC" or "NFKD" results in the error created by
 * {@code newRangeError} being thrown.
 * 
 * @param cx
 *            the execution context
 * @param thisValue
 *            the function this-value
 * @param form
 *            the normalisation form
 * @return the normalized string
 */
@Function(name = "normalize", arity = 0)
public static Object normalize(ExecutionContext cx, Object thisValue, Object form) {
    /* step 1 */
    Object coercible = RequireObjectCoercible(cx, thisValue);
    /* step 2 */
    String text = ToFlatString(cx, coercible);
    /* steps 3-4 */
    String formName;
    if (Type.isUndefined(form)) {
        formName = "NFC";
    } else {
        formName = ToFlatString(cx, form);
    }
    /* step 5 */
    Normalizer2 normalizer;
    switch (formName) {
    case "NFC":
        normalizer = Normalizer2.getNFCInstance();
        break;
    case "NFD":
        normalizer = Normalizer2.getNFDInstance();
        break;
    case "NFKC":
        normalizer = Normalizer2.getNFKCInstance();
        break;
    case "NFKD":
        normalizer = Normalizer2.getNFKDInstance();
        break;
    default:
        // Form names are matched exactly; anything else is rejected.
        throw newRangeError(cx, Messages.Key.InvalidNormalizationForm, formName);
    }
    /* steps 6-7 */
    return ensureValidString(cx, () -> normalizer.normalize(text));
}
 
Example 18
Source Project: fitnotifications   Source File: UCharacterProperty.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the combining class that the NFD normalizer instance reports for {@code c}.
 *
 * @param c the code point to look up
 * @return the result of {@code getCombiningClass(c)} on the NFD instance
 */
@Override
int getValue(int c) {
    final Normalizer2 nfd = Normalizer2.getNFDInstance();
    return nfd.getCombiningClass(c);
}
 
Example 19
Source Project: lucene-solr   Source File: ICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a char filter over {@code in} that uses the given normalizer and a temporary
 * character buffer created with {@code bufferSize}.
 *
 * @param in         the reader handed to the superclass constructor
 * @param normalizer the normalizer to use; must not be {@code null}
 * @param bufferSize the size passed to {@code CharacterUtils.newCharacterBuffer}
 * @throws NullPointerException if {@code normalizer} is {@code null}
 */
ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
  super(in);
  // Reject a null normalizer before any field is assigned.
  Objects.requireNonNull(normalizer);
  this.normalizer = normalizer;
  this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
}
 
Example 20
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfc" normalizer in COMPOSE mode and the arguments
 * 20, RANDOM_MULTIPLIER*1000 and 128.
 */
public void testNFC() throws Exception {
  final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
  doTestMode(nfc, 20, RANDOM_MULTIPLIER*1000, 128);
}
 
Example 21
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfc" normalizer in COMPOSE mode and the arguments
 * 256, RANDOM_MULTIPLIER*500 and 16.
 */
public void testNFCHuge() throws Exception {
  final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
  doTestMode(nfc, 256, RANDOM_MULTIPLIER*500, 16);
}
 
Example 22
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfc" normalizer in DECOMPOSE mode and the arguments
 * 20, RANDOM_MULTIPLIER*1000 and 128.
 */
public void testNFD() throws Exception {
  // The "nfc" data in DECOMPOSE mode is what this NFD test exercises.
  final Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
  doTestMode(nfd, 20, RANDOM_MULTIPLIER*1000, 128);
}
 
Example 23
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfc" normalizer in DECOMPOSE mode and the arguments
 * 256, RANDOM_MULTIPLIER*500 and 16.
 */
public void testNFDHuge() throws Exception {
  // The "nfc" data in DECOMPOSE mode is what this NFD test exercises.
  final Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
  doTestMode(nfd, 256, RANDOM_MULTIPLIER*500, 16);
}
 
Example 24
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfkc" normalizer in COMPOSE mode and the arguments
 * 20, RANDOM_MULTIPLIER*1000 and 128.
 */
public void testNFKC() throws Exception {
  final Normalizer2 nfkc = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
  doTestMode(nfkc, 20, RANDOM_MULTIPLIER*1000, 128);
}
 
Example 25
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfkc" normalizer in COMPOSE mode and the arguments
 * 256, RANDOM_MULTIPLIER*500 and 16.
 */
public void testNFKCHuge() throws Exception {
  final Normalizer2 nfkc = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
  doTestMode(nfkc, 256, RANDOM_MULTIPLIER*500, 16);
}
 
Example 26
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfkc" normalizer in DECOMPOSE mode and the arguments
 * 20, RANDOM_MULTIPLIER*1000 and 128.
 */
public void testNFKD() throws Exception {
  // The "nfkc" data in DECOMPOSE mode is what this NFKD test exercises.
  final Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
  doTestMode(nfkd, 20, RANDOM_MULTIPLIER*1000, 128);
}
 
Example 27
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfkc" normalizer in DECOMPOSE mode and the arguments
 * 256, RANDOM_MULTIPLIER*500 and 16.
 */
public void testNFKDHuge() throws Exception {
  // The "nfkc" data in DECOMPOSE mode is what this NFKD test exercises.
  final Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
  doTestMode(nfkd, 256, RANDOM_MULTIPLIER*500, 16);
}
 
Example 28
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfkc_cf" normalizer in COMPOSE mode and the arguments
 * 20, RANDOM_MULTIPLIER*1000 and 128.
 */
public void testNFKC_CF() throws Exception {
  final Normalizer2 nfkcCasefold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  doTestMode(nfkcCasefold, 20, RANDOM_MULTIPLIER*1000, 128);
}
 
Example 29
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code doTestMode} with the "nfkc_cf" normalizer in COMPOSE mode and the arguments
 * 256, RANDOM_MULTIPLIER*500 and 16.
 */
public void testNFKC_CFHuge() throws Exception {
  final Normalizer2 nfkcCasefold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  doTestMode(nfkcCasefold, 256, RANDOM_MULTIPLIER*500, 16);
}
 
Example 30
Source Project: trekarta   Source File: UCharacterProperty.java    License: GNU General Public License v3.0 4 votes vote down vote up
/**
 * Looks up the combining class of {@code c} through the NFD normalizer instance.
 *
 * @param c the code point to look up
 * @return the result of {@code getCombiningClass(c)} on the NFD instance
 */
@Override
int getValue(int c) {
    final Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance();
    final int combiningClass = nfdNormalizer.getCombiningClass(c);
    return combiningClass;
}