com.ibm.icu.text.Normalizer2 Java Examples
The following examples show how to use
com.ibm.icu.text.Normalizer2.
The originating project, source file, and license for each example are noted in the header above it.
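Before the project examples, here is a minimal standalone sketch (not taken from any project below; the inputs are illustrative assumptions) of the two common ways to obtain a normalizer and apply it:

import com.ibm.icu.text.Normalizer2;

public class Normalizer2Basics {
    public static void main(String[] args) {
        // Built-in NFC instance (available since ICU4J 49).
        Normalizer2 nfc = Normalizer2.getNFCInstance();

        // Named instance: a null data argument loads ICU's bundled data,
        // "nfkc_cf" selects NFKC with case folding, COMPOSE selects the composed form.
        Normalizer2 nfkcCf = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);

        String decomposed = "e\u0301";                     // 'e' followed by a combining acute accent
        System.out.println(nfc.isNormalized(decomposed));  // false
        System.out.println(nfc.normalize(decomposed));     // "\u00E9" (precomposed é)
        System.out.println(nfkcCf.normalize("ＡＢＣ"));    // "abc" (width-folded and case-folded)
    }
}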
Example #1
Source File: IcuNormalizerCharFilterFactory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
protected Normalizer2.Mode getNormalizationMode(Settings settings) {
    Normalizer2.Mode normalizationMode;
    switch (settings.get("normalization_mode", "compose")) {
        case "compose_contiguous":
            normalizationMode = Normalizer2.Mode.COMPOSE_CONTIGUOUS;
            break;
        case "decompose":
            normalizationMode = Normalizer2.Mode.DECOMPOSE;
            break;
        case "fcd":
            normalizationMode = Normalizer2.Mode.FCD;
            break;
        default:
            normalizationMode = Normalizer2.Mode.COMPOSE;
            break;
    }
    return normalizationMode;
}
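The mode selected by this method is then combined with a form name and passed to Normalizer2.getInstance; a hedged sketch of that wiring follows (the "nfc" form and FCD mode are illustrative choices here, not necessarily the plugin's defaults):

// Illustrative wiring only (not the plugin's code): turn a configured mode into a normalizer.
Normalizer2.Mode mode = Normalizer2.Mode.FCD;                        // as if "fcd" had been configured
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfc", mode); // FCD normalizer over the nfc data
String result = normalizer.normalize("a\u0301\u0300");               // returns an FCD-normalized string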
Example #2
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testMassiveLigature() throws IOException {
    String input = "\uFDFA";
    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"صلى", "الله", "عليه", "وسلم"},
        new int[]{0, 0, 0, 0},
        new int[]{0, 0, 0, 1},
        input.length());
}
Example #3
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testTokenStream2() throws IOException {
    // '㌰', '<<'゙, '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
    String input = "㌰゙5℃№㈱㌘ザゾ";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), 1, 1);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
        new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
        new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
        input.length());
}
Example #4
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testTokenStream() throws IOException {
    // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
    String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
        new int[] {0, 2, 4, 6, 8, 11, 14},
        new int[] {1, 3, 5, 7, 10, 13, 16},
        input.length());
}
Example #5
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNormalization() throws IOException {
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
        int length = reader.read(tempBuff);
        if (length == -1) {
            break;
        }
        output.append(tempBuff, 0, length);
        assertEquals(output.toString(),
            normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
    }

    assertEquals(expectedOutput, output.toString());
}
Example #6
Source File: TestICUNormalizer2Filter.java From lucene-solr with Apache License 2.0
public void testAlternate() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(
                tokenizer,
                /* specify nfc with decompose to get nfd */
                Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)));
        }
    };

    // decompose EAcute into E + combining Acute
    assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
    a.close();
}
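Outside of the analyzer plumbing, the same "nfc data with DECOMPOSE mode equals NFD" trick can be verified directly against the Normalizer2 API; a minimal sketch (illustrative only):

// é (U+00E9) decomposes to 'e' + combining acute accent under NFD.
Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
System.out.println(nfd.normalize("\u00E9"));                      // "e" + U+0301
System.out.println(nfd.normalize("\u00E9")
    .equals(Normalizer2.getNFDInstance().normalize("\u00E9")));   // true: same result as getNFDInstance()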
Example #7
Source File: ICUNormalizer2FilterFactory.java From lucene-solr with Apache License 2.0
/** Creates a new ICUNormalizer2FilterFactory */
public ICUNormalizer2FilterFactory(Map<String,String> args) {
    super(args);
    String form = get(args, "form", "nfkc_cf");
    String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");
    Normalizer2 normalizer = Normalizer2.getInstance(
        null, form, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);

    String filter = get(args, "filter");
    if (filter != null) {
        UnicodeSet set = new UnicodeSet(filter);
        if (!set.isEmpty()) {
            set.freeze();
            normalizer = new FilteredNormalizer2(normalizer, set);
        }
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
    this.normalizer = normalizer;
}
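For context, FilteredNormalizer2 simply restricts normalization to code points inside the given UnicodeSet; a small standalone sketch (the set used here is an arbitrary assumption, not taken from the factory above):

// Only characters inside the set are normalized; everything else passes through untouched.
Normalizer2 nfkcCf = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
UnicodeSet set = new UnicodeSet("[^åäöÅÄÖ]").freeze();           // leave Swedish letters alone
Normalizer2 filtered = new FilteredNormalizer2(nfkcCf, set);
System.out.println(filtered.normalize("Ａ Å"));                  // fullwidth A folds to "a", Å is preserved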
Example #8
Source File: ICUNormalizer2CharFilterFactory.java From lucene-solr with Apache License 2.0
/** Creates a new ICUNormalizer2CharFilterFactory */
public ICUNormalizer2CharFilterFactory(Map<String,String> args) {
    super(args);
    String form = get(args, "form", "nfkc_cf");
    String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");
    Normalizer2 normalizer = Normalizer2.getInstance(
        null, form, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);

    String filter = get(args, "filter");
    if (filter != null) {
        UnicodeSet set = new UnicodeSet(filter);
        if (!set.isEmpty()) {
            set.freeze();
            normalizer = new FilteredNormalizer2(normalizer, set);
        }
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
    this.normalizer = normalizer;
}
Example #9
Source File: ICUFoldingFilterFactory.java From lucene-solr with Apache License 2.0
/** Creates a new ICUFoldingFilterFactory */
public ICUFoldingFilterFactory(Map<String,String> args) {
    super(args);
    Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
    String filter = get(args, "filter");
    if (filter != null) {
        UnicodeSet set = new UnicodeSet(filter);
        if (!set.isEmpty()) {
            set.freeze();
            normalizer = new FilteredNormalizer2(normalizer, set);
        }
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
    this.normalizer = normalizer;
}
Example #10
Source File: IcuNormalizerTokenFilterFactory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
protected Normalizer2.Mode getNormalizationMode(Settings settings) {
    Normalizer2.Mode normalizationMode;
    switch (settings.get("normalization_mode", "compose")) {
        case "compose_contiguous":
            normalizationMode = Normalizer2.Mode.COMPOSE_CONTIGUOUS;
            break;
        case "decompose":
            normalizationMode = Normalizer2.Mode.DECOMPOSE;
            break;
        case "fcd":
            normalizationMode = Normalizer2.Mode.FCD;
            break;
        default:
            normalizationMode = Normalizer2.Mode.COMPOSE;
            break;
    }
    return normalizationMode;
}
Example #11
Source File: CollationBuilder.java From fitnotifications with Apache License 2.0
public CollationBuilder(CollationTailoring b) {
    nfd = Normalizer2.getNFDInstance();
    fcd = Norm2AllModes.getFCDNormalizer2();
    nfcImpl = Norm2AllModes.getNFCInstance().impl;
    base = b;
    baseData = b.data;
    rootElements = new CollationRootElements(b.data.rootElements);
    variableTop = 0;
    dataBuilder = new CollationDataBuilder();
    fastLatinEnabled = true;
    cesLength = 0;
    rootPrimaryIndexes = new UVector32();
    nodes = new UVector64();
    nfcImpl.ensureCanonIterData();
    dataBuilder.initForTailoring(baseData);
}
Example #12
Source File: StringPrototype.java From es6draft with MIT License
/**
 * 21.1.3.12 String.prototype.normalize ( [ form ] )
 *
 * @param cx
 *            the execution context
 * @param thisValue
 *            the function this-value
 * @param form
 *            the normalisation form
 * @return the normalized string
 */
@Function(name = "normalize", arity = 0)
public static Object normalize(ExecutionContext cx, Object thisValue, Object form) {
    /* step 1 */
    Object obj = RequireObjectCoercible(cx, thisValue);
    /* step 2 */
    String s = ToFlatString(cx, obj);
    /* steps 3-4 */
    String f = !Type.isUndefined(form) ? ToFlatString(cx, form) : "NFC";
    /* step 5 */
    Normalizer2 normalizer;
    switch (f) {
    case "NFC":
        normalizer = Normalizer2.getNFCInstance();
        break;
    case "NFD":
        normalizer = Normalizer2.getNFDInstance();
        break;
    case "NFKC":
        normalizer = Normalizer2.getNFKCInstance();
        break;
    case "NFKD":
        normalizer = Normalizer2.getNFKDInstance();
        break;
    default:
        throw newRangeError(cx, Messages.Key.InvalidNormalizationForm, f);
    }
    /* steps 6-7 */
    return ensureValidString(cx, () -> normalizer.normalize(s));
}
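The four built-in accessors used above differ only in the Unicode normalization form they implement; a quick standalone comparison (the ligature input is just an illustrative choice):

String fi = "\uFB01";                                             // 'ﬁ' LATIN SMALL LIGATURE FI
System.out.println(Normalizer2.getNFCInstance().normalize(fi));  // "ﬁ" (canonical forms keep the ligature)
System.out.println(Normalizer2.getNFDInstance().normalize(fi));  // "ﬁ"
System.out.println(Normalizer2.getNFKCInstance().normalize(fi)); // "fi" (compatibility forms expand it)
System.out.println(Normalizer2.getNFKDInstance().normalize(fi)); // "fi"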
Example #13
Source File: IcuNormalizerCharFilterFactory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment,
                                      String name, Settings settings) {
    super(indexSettings, name);
    Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings),
        getNormalizationName(settings), getNormalizationMode(settings));
    String unicodeSetFilter = settings.get("unicode_set_filter");
    this.normalizer = unicodeSetFilter != null ?
        new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base;
}
Example #14
Source File: IcuNormalizerTokenFilterFactory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment,
                                       String name, Settings settings) {
    super(indexSettings, name, settings);
    Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings),
        getNormalizationName(settings), getNormalizationMode(settings));
    String unicodeSetFilter = settings.get("unicode_set_filter");
    this.normalizer = unicodeSetFilter != null ?
        new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base;
}
Example #15
Source File: SegmentationIcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                new DefaultIcuTokenizerConfig(false, true));
            TokenFilter filter = new IcuNormalizerFilter(tokenizer,
                Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
Example #16
Source File: IcuNormalizerFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testAlternate() throws Exception {
    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new IcuNormalizerFilter(
                tokenizer,
                Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)));
        }
    };
    assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
    a.close();
}
Example #17
Source File: IcuNormalizerFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testEmptyTerm() throws Exception {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new IcuNormalizerFilter(tokenizer,
                Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)));
        }
    };
    checkOneTerm(a, "", "");
    a.close();
}
Example #18
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFCHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}
Example #19
Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0
@Override
int getValue(int c) {
    return Normalizer2.getNFDInstance().getCombiningClass(c);
}
Example #20
Source File: ICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
    super(in);
    this.normalizer = Objects.requireNonNull(normalizer);
    this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
}
Example #21
Source File: IcuNormalizerCharFilter.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private IcuNormalizerCharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
    super(in);
    this.normalizer = Objects.requireNonNull(normalizer);
    this.tmpBuffer = new char[bufferSize];
}
Example #22
Source File: Normalizer2Factory.java From oacc-core with Apache License 2.0
public static Normalizer2 getNFCInstance() {
    return Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
}
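For comparison, ICU4J 49 and later also expose the same normalizer through a dedicated accessor, so the factory method above could be reduced to the one-liner below (shown only as an equivalence note, not part of the oacc-core source):

// Equivalent on ICU4J 49+: the dedicated accessor returns the NFC normalizer directly.
Normalizer2 nfc = Normalizer2.getNFCInstance();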
Example #23
Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0
@Override
int getValue(int c) {
    return Normalizer2.getNFDInstance().getCombiningClass(c);
}
Example #24
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKC_CFHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}
Example #25
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKC_CF() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000, 128);
}
Example #26
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKDHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}
Example #27
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKD() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 20, RANDOM_MULTIPLIER*1000, 128);
}
Example #28
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKCHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}
Example #29
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFKC() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000, 128);
}
Example #30
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testNFDHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 256, RANDOM_MULTIPLIER*500, 16);
}