com.ibm.icu.text.RuleBasedBreakIterator Java Examples

The following examples show how to use com.ibm.icu.text.RuleBasedBreakIterator. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DefaultICUTokenizerConfig.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Maps a break-iterator rule status to a token type string.
 *
 * <p>The {@code script} argument is only consulted for the kana and letter
 * statuses, where it selects between two candidate types.
 *
 * @param script     script code of the token
 * @param ruleStatus rule status reported for the token
 * @return the matching token type, or {@code "<OTHER>"} for any status not handled here
 */
@Override
public String getType(int script, int ruleStatus) {
  if (ruleStatus == RuleBasedBreakIterator.WORD_IDEO) {
    return WORD_IDEO;
  }
  if (ruleStatus == RuleBasedBreakIterator.WORD_KANA) {
    // Anything that is not Hiragana is reported as Katakana.
    if (script == UScript.HIRAGANA) {
      return WORD_HIRAGANA;
    }
    return WORD_KATAKANA;
  }
  if (ruleStatus == RuleBasedBreakIterator.WORD_LETTER) {
    // Hangul gets its own type; every other script is a plain letter token.
    if (script == UScript.HANGUL) {
      return WORD_HANGUL;
    }
    return WORD_LETTER;
  }
  if (ruleStatus == RuleBasedBreakIterator.WORD_NUMBER) {
    return WORD_NUMBER;
  }
  if (ruleStatus == EMOJI_SEQUENCE_STATUS) {
    return WORD_EMOJI;
  }
  /* some other custom code */
  return "<OTHER>";
}
 
Example #2
Source File: LineBreaker.java    From ttt with BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Returns the cached line break iterator, loading and caching it on first use.
 *
 * <p>If a compiled rules resource can be located, a rule based break iterator
 * is created from it; if no resource is located, a character break iterator is
 * used instead. If reading the rules fails with an {@link IOException}, the
 * failure is reported through {@code reporter} (when one is supplied) and
 * {@code null} is returned without caching anything.
 *
 * @param reporter sink for informational messages
 * @return the line break iterator, or {@code null} if the rules could not be loaded
 */
private LineBreakIterator maybeLoad(Reporter reporter) {
    LineBreakIterator iterator = this.iterator;
    if (iterator != null)
        return iterator;
    else {
        BreakIterator bi = null;
        InputStream is = null;
        try {
            URL rulesLocator = getRulesLocator(name, RULES_BINARY_EXT);
            if (rulesLocator != null) {
                is = rulesLocator.openStream();
                bi = RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
                reporter.logInfo(reporter.message("*KEY*", "Loaded rules based break iterator from ''{0}''.", rulesLocator.toString()));
            } else
                bi = BreakIterator.getCharacterInstance();
        } catch (IOException e) {
            // Do not swallow the failure silently: the caller only sees a null
            // result, so record why loading failed. Guard against a null reporter
            // because this path can be reached before the reporter is first used.
            if (reporter != null)
                reporter.logInfo(reporter.message("*KEY*", "Unable to load rules based break iterator: {0}.", e.toString()));
        } finally {
            IOUtil.closeSafely(is);
        }
        if (bi != null) {
            return this.iterator = new LineBreakIterator(bi);
        } else
            return null;
    }
}
 
Example #3
Source File: BreakIteratorWrapper.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Derives a word status for the text between two break positions.
 *
 * <p>The code points in the range are scanned in order; the first digit found
 * yields {@code WORD_NUMBER} and the first letter found yields
 * {@code WORD_LETTER}. If neither occurs, or either position is
 * {@code BreakIterator.DONE}, the result is {@code WORD_NONE}.
 *
 * @param current break position where the range begins, relative to {@code start}
 * @param next    break position where the range ends, relative to {@code start}
 * @return one of the {@code RuleBasedBreakIterator} word status constants
 */
private int calcStatus(int current, int next) {
    if (current == BreakIterator.DONE || next == BreakIterator.DONE) {
        return RuleBasedBreakIterator.WORD_NONE;
    }
    int begin = start + current;
    int end = start + next;
    int codepoint;
    for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
        // Read the code point at the loop position. Reading at 'begin' on every
        // pass would re-examine only the first code point of the range.
        codepoint = UTF16.charAt(text, 0, end, i);
        if (UCharacter.isDigit(codepoint)) {
            return RuleBasedBreakIterator.WORD_NUMBER;
        } else if (UCharacter.isLetter(codepoint)) {
            return RuleBasedBreakIterator.WORD_LETTER;
        }
    }
    return RuleBasedBreakIterator.WORD_NONE;
}
 
Example #4
Source File: BreakIteratorWrapper.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Points this wrapper at a new region of a character array.
 *
 * <p>Records the array and offset, resets the text iterator to the region,
 * hands that iterator to the underlying break iterator, and clears the status
 * to {@code WORD_NONE}.
 *
 * @param text   backing character array
 * @param start  offset of the region within {@code text}
 * @param length number of characters in the region
 */
void setText(char text[], int start, int length) {
  // Remember where the region lives.
  this.start = start;
  this.text = text;

  // Re-target the iterators at the new region.
  textIterator.setText(text, start, length);
  rbbi.setText(textIterator);

  // No break has been examined yet for this text.
  status = RuleBasedBreakIterator.WORD_NONE;
}
 
Example #5
Source File: ICUTokenizerFactory.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads break iterator rules from a resource and builds a
 * {@link RuleBasedBreakIterator} from them.
 *
 * <p>The resource is decoded as UTF-8. Lines starting with {@code #} are
 * dropped, but a newline is still emitted for each of them, so the rule text
 * keeps the same number of lines as the resource (presumably so that line
 * numbers in rule syntax errors still match the file).
 *
 * @param filename name of the rules resource
 * @param loader   loader used to open the resource
 * @return a break iterator built from the rules
 * @throws IOException if the resource cannot be opened or read
 */
private BreakIterator parseRules(String filename, ResourceLoader loader) throws IOException {
  StringBuilder rules = new StringBuilder();
  // try-with-resources closes the reader and stream even if reading fails part-way.
  try (InputStream rulesStream = loader.openResource(filename);
       BufferedReader reader = new BufferedReader(
           IOUtils.getDecodingReader(rulesStream, StandardCharsets.UTF_8))) {
    String line;
    while ((line = reader.readLine()) != null) {
      if (!line.startsWith("#")) {
        rules.append(line);
      }
      // Appended for comment lines too, which keeps the line count unchanged.
      rules.append('\n');
    }
  }
  return new RuleBasedBreakIterator(rules.toString());
}
 
Example #6
Source File: DefaultICUTokenizerConfig.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a fresh clone of the break iterator used for the given script.
 *
 * <p>Japanese uses the CJK iterator. Myanmar uses the syllable iterator unless
 * {@code myanmarAsWords} is set. Everything else, including Myanmar with
 * {@code myanmarAsWords} set, uses the default iterator.
 *
 * @param script script code to select an iterator for
 * @return a clone of the selected iterator
 */
@Override
public RuleBasedBreakIterator getBreakIterator(int script) {
  if (script == UScript.JAPANESE) {
    return (RuleBasedBreakIterator) cjkBreakIterator.clone();
  }
  if (script == UScript.MYANMAR && !myanmarAsWords) {
    return (RuleBasedBreakIterator) myanmarSyllableIterator.clone();
  }
  return (RuleBasedBreakIterator) defaultBreakIterator.clone();
}
 
Example #7
Source File: DefaultICUTokenizerConfig.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Loads a break iterator from a compiled rules resource located relative to
 * {@code DefaultICUTokenizerConfig}.
 *
 * @param filename name of the compiled rules resource
 * @return the break iterator built from the compiled rules
 * @throws RuntimeException wrapping any {@link IOException} raised while reading
 */
private static RuleBasedBreakIterator readBreakIterator(String filename) {
  // try-with-resources closes the stream even when loading the rules fails;
  // closing only after a successful load would leak it on that path.
  try (InputStream is = DefaultICUTokenizerConfig.class.getResourceAsStream(filename)) {
    return RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
 
Example #8
Source File: RBBIRuleCompiler.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Compiles every file in {@code srcDir} whose name ends with {@code rbbi} into
 * a file in {@code destDir} whose name has the trailing {@code rbbi} replaced
 * by {@code brk}.
 *
 * <p>Progress is written to {@code System.err}. If the rules of a file are
 * rejected with an {@link IllegalArgumentException}, its message is printed
 * and the JVM exits with status 1.
 *
 * @param srcDir  directory containing the rule source files
 * @param destDir directory receiving the compiled output
 * @throws IOException if {@code srcDir} cannot be listed
 * @throws Exception   if reading the rules or writing the output fails
 */
static void compile(File srcDir, File destDir) throws Exception {
  File[] files = srcDir.listFiles(new FilenameFilter() {
    @Override
    public boolean accept(File dir, String name) {
      return name.endsWith("rbbi");
    }
  });
  if (files == null) throw new IOException("Path does not exist: " + srcDir);
  for (File file : files) {
    File outputFile = new File(destDir,
        file.getName().replaceAll("rbbi$", "brk"));
    String rules = getRules(file);
    System.err.print("Compiling " + file.getName() + " to "
        + outputFile.getName() + ": ");
    /*
     * if there is a syntax error, compileRules() may succeed. the way to
     * check is to try to instantiate from the string. additionally if the
     * rules are invalid, you can get a useful syntax error.
     */
    try {
      new RuleBasedBreakIterator(rules);
    } catch (IllegalArgumentException e) {
      /*
       * do this intentionally, so you don't get a massive stack trace
       * instead, get a useful syntax error!
       */
      System.err.println(e.getMessage());
      System.exit(1);
    }
    // try-with-resources closes the output file even if compileRules() throws.
    try (FileOutputStream os = new FileOutputStream(outputFile)) {
      RuleBasedBreakIterator.compileRules(rules, os);
    }
    System.err.println(outputFile.length() + " bytes.");
  }
}
 
Example #9
Source File: DefaultIcuTokenizerConfig.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Loads a break iterator from a compiled rules resource located relative to
 * {@code DefaultIcuTokenizerConfig}. The stream is closed when loading ends.
 *
 * @param resourceName name of the compiled rules resource
 * @return the break iterator built from the compiled rules
 * @throws UncheckedIOException if reading the resource fails
 */
private static RuleBasedBreakIterator readBreakIterator(String resourceName) {
    try (InputStream compiledRules = DefaultIcuTokenizerConfig.class.getResourceAsStream(resourceName)) {
        RuleBasedBreakIterator loaded = RuleBasedBreakIterator.getInstanceFromCompiledRules(compiledRules);
        return loaded;
    } catch (IOException ioe) {
        String detail = "unable to load resource " + resourceName + " " + ioe.getMessage();
        throw new UncheckedIOException(detail, ioe);
    }
}
 
Example #10
Source File: DefaultIcuTokenizerConfig.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Maps a break-iterator rule status to a token type string.
 *
 * <p>The {@code script} argument only matters for the kana and letter
 * statuses, where it picks between two candidate types.
 *
 * @param script     script code of the token
 * @param ruleStatus rule status reported for the token
 * @return the matching token type, or {@code "<OTHER>"} for any status not handled here
 */
@Override
public String getType(int script, int ruleStatus) {
    final String type;
    switch (ruleStatus) {
        case RuleBasedBreakIterator.WORD_IDEO:
            type = WORD_IDEO;
            break;
        case RuleBasedBreakIterator.WORD_KANA:
            // Anything that is not Hiragana is reported as Katakana.
            type = (script == UScript.HIRAGANA) ? WORD_HIRAGANA : WORD_KATAKANA;
            break;
        case RuleBasedBreakIterator.WORD_LETTER:
            // Hangul gets its own type; other scripts are plain letter tokens.
            type = (script == UScript.HANGUL) ? WORD_HANGUL : WORD_LETTER;
            break;
        case RuleBasedBreakIterator.WORD_NUMBER:
            type = WORD_NUMBER;
            break;
        default: /* some other custom code */
            type = "<OTHER>";
            break;
    }
    return type;
}
 
Example #11
Source File: BreakIteratorWrapper.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Wraps a break iterator in the matching wrapper implementation.
 *
 * <p>If it's a RuleBasedBreakIterator, the rule status can be used for the
 * token type. If it's any other BreakIterator, the rule status method is not
 * available, so it is treated like a generic BreakIterator.
 *
 * @param breakIterator the iterator to wrap
 * @return an {@code RBBIWrapper} for rule based iterators, otherwise a {@code BIWrapper}
 */
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
    // Generic iterators take the plain wrapper.
    if (!(breakIterator instanceof RuleBasedBreakIterator)) {
        return new BIWrapper(breakIterator);
    }
    RuleBasedBreakIterator ruleBased = (RuleBasedBreakIterator) breakIterator;
    return new RBBIWrapper(ruleBased);
}
 
Example #12
Source File: RBBIRuleCompiler.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Reads break iterator rules from {@code inputStream} and writes their
 * compiled form to {@code outputStream}, which is closed when this method
 * finishes.
 *
 * <p>An {@link IllegalArgumentException} raised while processing the rules is
 * logged as an error and not rethrown.
 *
 * @param inputStream  source of the rule text
 * @param outputStream destination of the compiled rules; closed by this method
 * @throws IOException if reading the rules or writing the output fails
 */
public void compile(InputStream inputStream, OutputStream outputStream) throws IOException {
    final String ruleSource = getRules(inputStream);
    try (OutputStream sink = outputStream) {
        // The instance is discarded; presumably it is built only so that invalid
        // rules are rejected before anything is written.
        new RuleBasedBreakIterator(ruleSource);
        RuleBasedBreakIterator.compileRules(ruleSource, sink);
    } catch (IllegalArgumentException invalidRules) {
        logger.error(invalidRules.getMessage(), invalidRules);
    }
}
 
Example #13
Source File: BreakIteratorWrapper.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a wrapper around the given rule based break iterator.
 *
 * @param rbbi the break iterator stored by this wrapper
 */
BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
  this.rbbi = rbbi;
}
 
Example #14
Source File: ICUTokenizerConfig.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Returns a break iterator capable of processing the given script.
 *
 * @param script code identifying the script (presumably a UScript constant; confirm against callers)
 * @return a rule based break iterator for that script
 */
public abstract RuleBasedBreakIterator getBreakIterator(int script);
 
Example #15
Source File: BreakIteratorWrapper.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Creates a wrapper around the given rule based break iterator.
 *
 * @param rbbi the break iterator stored by this wrapper
 */
RBBIWrapper(RuleBasedBreakIterator rbbi) {
    this.rbbi = rbbi;
}
 
Example #16
Source File: BreakIteratorWrapper.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Sets the text to be analyzed and resets the status.
 *
 * @param text the text handed to the underlying break iterator
 */
@Override
void setText(CharacterIterator text) {
    bi.setText(text);
    // New text: reset the status to WORD_NONE.
    status = RuleBasedBreakIterator.WORD_NONE;
}