com.ibm.icu.text.Transliterator Java Examples

The following examples show how to use com.ibm.icu.text.Transliterator. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ICUTransformFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates an ICUTransformFilter that transforms the text of the given stream.
 *
 * <p>Side effect on {@code transform}: if it has no filter and is a
 * {@code RuleBasedTransliterator}, its non-empty source set is installed as its filter.
 *
 * @param input {@link TokenStream} to filter.
 * @param transform Transliterator used to transform the text.
 */
@SuppressWarnings("deprecation")
public ICUTransformFilter(TokenStream input, Transliterator transform) {
  super(input);
  this.transform = transform;

  /*
   * Shortcut inherited from the original author: admittedly a cheat, but it
   * speeds things up a lot. Pkg-private APIs could probably do better still.
   */
  final boolean alreadyFiltered = transform.getFilter() != null;
  if (alreadyFiltered || !(transform instanceof com.ibm.icu.text.RuleBasedTransliterator)) {
    return;
  }
  final UnicodeSet handledChars = transform.getSourceSet();
  if (handledChars == null || handledChars.isEmpty()) {
    return;
  }
  transform.setFilter(handledChars);
}
 
Example #2
Source File: IcuTransformTokenFilterFactory.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Builds the transliterator for this factory from the supplied settings.
 *
 * <p>Settings read: {@code id} (default {@code "Null"}), {@code dir} (default
 * {@code "forward"}; any other value selects {@code Transliterator.REVERSE}),
 * {@code rules} (optional; when present the transliterator is created from these rules)
 * and {@code unicodeSetFilter} (optional; when present it is parsed, frozen and set
 * as the transliterator's filter).
 */
public IcuTransformTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name,
                                      Settings settings) {
    super(indexSettings, name, settings);
    final String transformId = settings.get("id", "Null");
    final String requestedDirection = settings.get("dir", "forward");
    final int directionCode;
    if ("forward".equals(requestedDirection)) {
        directionCode = Transliterator.FORWARD;
    } else {
        directionCode = Transliterator.REVERSE;
    }

    final String customRules = settings.get("rules");
    if (customRules == null) {
        this.transliterator = Transliterator.getInstance(transformId, directionCode);
    } else {
        this.transliterator = Transliterator.createFromRules(transformId, customRules, directionCode);
    }

    final String filterPattern = settings.get("unicodeSetFilter");
    if (filterPattern != null) {
        transliterator.setFilter(new UnicodeSet(filterPattern).freeze());
    }
}
 
Example #3
Source File: TestICUTransformFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the empty string through a keyword tokenizer followed by an Any-Latin
 * ICUTransformFilter and checks, via checkOneTerm, that "" is produced for "".
 */
public void testEmptyTerm() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new KeywordTokenizer();
      ICUTransformFilter transformed =
          new ICUTransformFilter(source, Transliterator.getInstance("Any-Latin"));
      return new TokenStreamComponents(source, transformed);
    }
  };
  checkOneTerm(analyzer, "", "");
  analyzer.close();
}
 
Example #4
Source File: TestICUTransformFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Pushes random data through an analyzer made of a whitespace MockTokenizer and an
 * Any-Latin ICUTransformFilter. The single transliterator instance is shared by
 * every component set the analyzer creates.
 */
public void testRandomStrings() throws Exception {
  final Transliterator transform = Transliterator.getInstance("Any-Latin");
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      ICUTransformFilter transformed = new ICUTransformFilter(source, transform);
      return new TokenStreamComponents(source, transformed);
    }
  };
  checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
  analyzer.close();
}
 
Example #5
Source File: TestICUTransformFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that constructing an ICUTransformFilter around a rule-based transliterator
 * without a filter leaves it with a filter equal to [U+20087], the only character
 * its single rule matches.
 */
public void testOptimizerSurrogate() throws Exception {
  // Single rule: rewrite CJK UNIFIED IDEOGRAPH-20087 as an x.
  final String ruleText = "\\U00020087 > x;";
  final Transliterator ruleBased =
      Transliterator.createFromRules("test", ruleText, Transliterator.FORWARD);
  assertTrue(ruleBased.getFilter() == null);

  final KeywordTokenizer tokenizer = new KeywordTokenizer();
  tokenizer.setReader(new StringReader(""));
  // Constructed only for its side effect on ruleBased.
  new ICUTransformFilter(tokenizer, ruleBased);

  final UnicodeSet expectedFilter = new UnicodeSet("[\\U00020087]");
  assertTrue(ruleBased.getFilter().equals(expectedFilter));
}
 
Example #6
Source File: TestICUTransformFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that constructing an ICUTransformFilter around a rule-based transliterator
 * without a filter leaves it with a filter equal to [ab], the characters its rules match.
 */
public void testOptimizer() throws Exception {
  // Two rules: a becomes b, and b becomes c.
  final String ruleText = "a > b; b > c;";
  final Transliterator ruleBased =
      Transliterator.createFromRules("test", ruleText, Transliterator.FORWARD);
  assertTrue(ruleBased.getFilter() == null);

  final KeywordTokenizer tokenizer = new KeywordTokenizer();
  tokenizer.setReader(new StringReader(""));
  // Constructed only for its side effect on ruleBased.
  new ICUTransformFilter(tokenizer, ruleBased);

  final UnicodeSet expectedFilter = new UnicodeSet("[ab]");
  assertTrue(ruleBased.getFilter().equals(expectedFilter));
}
 
Example #7
Source File: UtilityExtensions.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Debugging aid: renders the text around a transliteration position as
 * {bbb|ccc|ddd}, where the braces mark the context start and limit and the
 * bars mark the start and limit, then escapes the result with
 * {@code com.ibm.icu.impl.Utility.escape}.
 *
 * @param input text being transliterated
 * @param pos position whose indices are rendered
 * @return the escaped rendering
 */
public static String formatInput(ReplaceableString input,
                                 Transliterator.Position pos) {
    final StringBuffer rendered = formatInput(new StringBuffer(), input, pos);
    return com.ibm.icu.impl.Utility.escape(rendered.toString());
}
 
Example #8
Source File: UtilityExtensions.java    From fitnotifications with Apache License 2.0 5 votes vote down vote up
/**
 * Debugging aid: appends a rendering of the text around a transliteration
 * position in the form {bbb|ccc|ddd}, where the braces mark the context start
 * and limit and the bars mark the start and limit. Text outside the context
 * is not emitted.
 *
 * <p>If the position's indices are not ordered as
 * 0 &lt;= contextStart &lt;= start &lt;= limit &lt;= contextLimit &lt;= input.length(),
 * an "INVALID Position" diagnostic is appended instead.
 *
 * @param appendTo buffer that receives the rendering
 * @param input text being transliterated
 * @param pos position whose indices are rendered
 * @return {@code appendTo}
 */
public static StringBuffer formatInput(StringBuffer appendTo,
                                       ReplaceableString input,
                                       Transliterator.Position pos) {
    final boolean wellOrdered =
        0 <= pos.contextStart
        && pos.contextStart <= pos.start
        && pos.start <= pos.limit
        && pos.limit <= pos.contextLimit
        && pos.contextLimit <= input.length();

    if (!wellOrdered) {
        return appendTo.append("INVALID Position {cs=" +
                               pos.contextStart + ", s=" + pos.start + ", l=" +
                               pos.limit + ", cl=" + pos.contextLimit + "} on " +
                               input);
    }

    // Slice all three pieces before touching the buffer.
    final String beforeStart = input.substring(pos.contextStart, pos.start);
    final String between = input.substring(pos.start, pos.limit);
    final String afterLimit = input.substring(pos.limit, pos.contextLimit);

    appendTo.append('{');
    appendTo.append(beforeStart);
    appendTo.append('|');
    appendTo.append(between);
    appendTo.append('|');
    appendTo.append(afterLimit);
    appendTo.append('}');
    return appendTo;
}
 
Example #9
Source File: IcuNormalizer.java    From enkan with Eclipse Public License 1.0 5 votes vote down vote up
/**
 * Creates a normalizer backed by the ICU transliterator with the given ID.
 *
 * @param translitId ID handed to {@code Transliterator.getInstance}
 * @throws MisconfigurationException if looking up the ID raises an
 *         {@link IllegalArgumentException}; the original exception is kept as the cause argument
 */
public IcuNormalizer(String translitId) {
    try {
        this.transliterator = Transliterator.getInstance(translitId);
    } catch (IllegalArgumentException rejectedId) {
        throw new MisconfigurationException("ILLEGAL_TRANSILIT_ID", translitId, rejectedId);
    }
}
 
Example #10
Source File: ICUTransformFilterFactory.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new ICUTransformFilterFactory.
 *
 * <p>Reads the mandatory {@code id} argument and the optional {@code direction}
 * argument ({@code "forward"} or {@code "reverse"}, default {@code "forward"}).
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if {@code args} is not empty once the known
 *         arguments have been read (presumably require/get remove what they read;
 *         verify against the superclass)
 */
public ICUTransformFilterFactory(Map<String,String> args) {
  super(args);
  final String transformId = require(args, "id");
  final String requestedDirection =
      get(args, "direction", Arrays.asList("forward", "reverse"), "forward", false);

  final int directionCode;
  if ("forward".equals(requestedDirection)) {
    directionCode = Transliterator.FORWARD;
  } else {
    directionCode = Transliterator.REVERSE;
  }
  transliterator = Transliterator.getInstance(transformId, directionCode);

  final boolean leftovers = !args.isEmpty();
  if (leftovers) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
 
Example #11
Source File: TestICUTransformFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Smoke-tests several built-in ICU transforms through ICUTransformFilter, one
 * input/expected pair per transform ID.
 */
public void testBasicFunctionality() throws Exception {
  checkToken(Transliterator.getInstance("Traditional-Simplified"), 
      "簡化字", "简化字"); 
  checkToken(Transliterator.getInstance("Katakana-Hiragana"), 
      "ヒラガナ", "ひらがな");
  // The expected value must be HALF-width katakana. It is written with escapes
  // (U+FF71 U+FF99 U+FF71 U+FF89 U+FF98 U+FF73) so that width-normalising tools
  // cannot silently fold it back to the full-width input.
  checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"), 
      "アルアノリウ", "\uFF71\uFF99\uFF71\uFF89\uFF98\uFF73");
  checkToken(Transliterator.getInstance("Any-Latin"), 
      "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos");
  checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"), 
      "Alphabētikós Katálogos", "Alphabetikos Katalogos");
  checkToken(Transliterator.getInstance("Han-Latin"),
      "中国", "zhōng guó");
}
 
Example #12
Source File: TransliterationTest.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/** Checks that the Any-Latin transform renders the Cyrillic sample as "alfavit". */
@Test
public void transliterateRussian() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    final String romanized = anyLatin.transform("алфавит");
    assertEquals("alfavit", romanized);
}
 
Example #13
Source File: TestICUTransformFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Feeds {@code input} through a KeywordTokenizer wrapped in an ICUTransformFilter
 * and asserts that the stream's contents are exactly the one token {@code expected}.
 *
 * @param transform transliterator under test
 * @param input text supplied to the tokenizer
 * @param expected the single expected output token
 */
private void checkToken(Transliterator transform, String input, String expected) throws IOException {
  final KeywordTokenizer tokenizer = new KeywordTokenizer();
  tokenizer.setReader(new StringReader(input));
  final TokenStream filtered = new ICUTransformFilter(tokenizer, transform);
  final String[] expectedTokens = { expected };
  assertTokenStreamContents(filtered, expectedTokens);
}
 
Example #14
Source File: TestICUTransformFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Checks that the compound "Traditional-Simplified; CaseFold" transform turns "ABCDE" into "abcde". */
public void testOptimizer2() throws Exception {
  final Transliterator compound = Transliterator.getInstance("Traditional-Simplified; CaseFold");
  checkToken(compound, "ABCDE", "abcde");
}
 
Example #15
Source File: TestICUTransformFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Checks a custom rule set that uses a preceding-context rule: "caa" must become "cbd". */
public void testCustomFunctionality2() throws Exception {
  // First rule: an a preceded by c becomes b; second rule: any other a becomes d.
  final String ruleText = "c { a > b; a > d;";
  final Transliterator contextual =
      Transliterator.createFromRules("test", ruleText, Transliterator.FORWARD);
  checkToken(contextual, "caa", "cbd");
}
 
Example #16
Source File: TestICUTransformFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Checks a simple custom rule set: "abacadaba" must become "bcbcbdbcb". */
public void testCustomFunctionality() throws Exception {
  // Two rules: a becomes b, and b becomes c.
  final String ruleText = "a > b; b > c;";
  final Transliterator shifting =
      Transliterator.createFromRules("test", ruleText, Transliterator.FORWARD);
  checkToken(shifting, "abacadaba", "bcbcbdbcb");
}
 
Example #17
Source File: UtilityExtensions.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Convenience overload that accepts a {@code Replaceable} and delegates to the
 * {@code ReplaceableString} variant.
 *
 * @param appendTo buffer that receives the rendering
 * @param input text being transliterated; cast to {@code ReplaceableString}, so any
 *        other implementation fails with a {@link ClassCastException}
 * @param pos position whose indices are rendered
 * @return the result of the delegated call
 */
public static StringBuffer formatInput(StringBuffer appendTo,
                                       Replaceable input,
                                       Transliterator.Position pos) {
    final ReplaceableString concrete = (ReplaceableString) input;
    return formatInput(appendTo, concrete, pos);
}
 
Example #18
Source File: UtilityExtensions.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Convenience overload that accepts a {@code Replaceable} and delegates to the
 * {@code ReplaceableString} variant.
 *
 * @param input text being transliterated; cast to {@code ReplaceableString}, so any
 *        other implementation fails with a {@link ClassCastException}
 * @param pos position whose indices are rendered
 * @return the result of the delegated call
 */
public static String formatInput(Replaceable input,
                                 Transliterator.Position pos) {
    final ReplaceableString concrete = (ReplaceableString) input;
    return formatInput(concrete, pos);
}
 
Example #19
Source File: TransliterationTest.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/** Checks that the Any-Latin transform renders the Chinese sample as "zì mǔ". */
@Test
public void transliterateChinese() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    final String romanized = anyLatin.transform("字母");
    assertEquals("zì mǔ", romanized);
}
 
Example #20
Source File: TransliterationTest.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/** Checks that the Any-Latin transform renders the Hangul sample as "alpabes". */
@Test
public void transliterateKorean() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    final String romanized = anyLatin.transform("알파벳");
    assertEquals("alpabes", romanized);
}
 
Example #21
Source File: TransliterationTest.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/** Checks the Any-Latin rendering of the Arabic sample against the expected romanization. */
@Test
public void transliterateArabic() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    final String romanized = anyLatin.transform("الأبجدية");
    assertEquals("ạlạ̉bjdyẗ", romanized);
}
 
Example #22
Source File: TransliterationTest.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Checks the Any-Latin rendering of the Thai sample against the expected
 * romanization. Currently disabled through {@code @Ignore}.
 */
@Test
@Ignore(value = "Transliterating Thai is not supported")
public void transliterateThai() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    final String romanized = anyLatin.transform("ตัวอักษร");
    assertEquals("tạw xạks̄ʹr", romanized);
}
 
Example #23
Source File: TransliterationTest.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/** Checks that the Any-Latin transform renders the Greek sample as "Alphabētikós". */
@Test
public void transliterateGreek() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    final String romanized = anyLatin.transform("Αλφαβητικός");
    assertEquals("Alphabētikós", romanized);
}
 
Example #24
Source File: TransliterationTest.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/** Checks that the Any-Latin transform renders the katakana sample as "arufabetto". */
@Test
public void transliterateJapanese() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    final String romanized = anyLatin.transform("アルファベット");
    assertEquals("arufabetto", romanized);
}
 
Example #25
Source File: SlugService.java    From mapr-music with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the service from its injected DAOs and obtains the ICU transliterator
 * identified by {@code ICU4J_TRANSLITERATOR_ID}.
 *
 * @param artistDao DAO injected under the name "artistDao"
 * @param albumDao DAO injected under the name "albumDao"
 */
@Inject
public SlugService(
        @Named("artistDao") MaprDbDao<Artist> artistDao,
        @Named("albumDao") MaprDbDao<Album> albumDao) {
    this.transliterator = Transliterator.getInstance(ICU4J_TRANSLITERATOR_ID);
    this.albumDao = albumDao;
    this.artistDao = artistDao;
}
 
Example #26
Source File: IcuTransformTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 2 votes vote down vote up
/**
 * Creates an IcuTransformTokenFilter over the given stream. The transliterator
 * is stored as passed; this constructor does not modify it.
 *
 * @param input {@link TokenStream} to filter.
 * @param transliterator Transliterator used to transform the text.
 */
public IcuTransformTokenFilter(
        TokenStream input,
        Transliterator transliterator) {
    super(input);
    this.transliterator = transliterator;
}
 
Example #27
Source File: BeanUtils.java    From apiman with Apache License 2.0 2 votes vote down vote up
/**
 * Derives a bean id from a bean name: the name is transliterated with the
 * "Any-Latin; Latin-ASCII" transform and the result is passed through
 * {@code removeNonWord}.
 * @param name the name
 * @return the id
 */
public static final String idFromName(String name) {
    final String latinAscii =
            Transliterator.getInstance("Any-Latin; Latin-ASCII").transliterate(name); //$NON-NLS-1$
    return removeNonWord(latinAscii);
}