org.apache.lucene.analysis.compound.hyphenation.HyphenationTree Java Examples

The following examples show how to use org.apache.lucene.analysis.compound.hyphenation.HyphenationTree. You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.

Example #1

Source File: TestCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Runs the hyphenation-based decompounder over a Danish sentence, using the
 * patterns in {@code da_UTF8.xml} and a two-word dictionary ("læse", "hest").
 * The expected output keeps every input token and adds "læse" and "hest"
 * after the compound "læsehest".
 */
public void testHyphenationCompoundWordsDA() throws Exception {
  final CharArraySet subwordDictionary = makeDictionary("læse", "hest");

  final String patternsLocation = getClass().getResource("da_UTF8.xml").toExternalForm();
  final HyphenationTree danishPatterns =
      HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(patternsLocation));

  final HyphenationCompoundWordTokenFilter filter = new HyphenationCompoundWordTokenFilter(
      whitespaceMockTokenizer("min veninde som er lidt af en læsehest"),
      danishPatterns,
      subwordDictionary,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
      false);

  final String[] expectedTerms = {
      "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest"
  };
  // NOTE(review): the int[] argument is presumably the expected position
  // increments (0 = stacked on the previous token) -- confirm against the
  // assertTokenStreamContents overload in the test base class.
  final int[] expectedIncrements = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 };

  assertTokenStreamContents(filter, expectedTerms, expectedIncrements);
}

Example #2

Source File: TestCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Decompounds "basketballkurv" with the longest-match flag switched on and a
 * maximum subword size of 40. The dictionary holds "basketball", "basket",
 * "ball" and "kurv"; the expected output omits "basket".
 */
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
  final CharArraySet subwordDictionary = makeDictionary("basketball", "basket", "ball", "kurv");

  final String patternsLocation = getClass().getResource("da_UTF8.xml").toExternalForm();
  final HyphenationTree patterns =
      HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(patternsLocation));

  // "basket" is not expected in the output because only the longest match is kept
  final HyphenationCompoundWordTokenFilter filter = new HyphenationCompoundWordTokenFilter(
      whitespaceMockTokenizer("basketballkurv"),
      patterns,
      subwordDictionary,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      40,
      true);

  final String[] expectedTerms = { "basketballkurv", "basketball", "ball", "kurv" };
  // NOTE(review): presumably position increments -- confirm against the
  // assertTokenStreamContents overload in the test base class.
  final int[] expectedIncrements = { 1, 0, 0, 0 };

  assertTokenStreamContents(filter, expectedTerms, expectedIncrements);
}

Example #3

Source File: TestCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Regression test for LUCENE-8124: decompounds "Rindfleisch" with the patterns
 * in {@code hyphenation-LUCENE-8124.xml}, using the hyphenator-only constructor
 * (no dictionary, default size limits).
 */
public void testLucene8124() throws Exception {
  final String patternsLocation =
      getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm();
  final HyphenationTree patterns =
      HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(patternsLocation));

  final HyphenationCompoundWordTokenFilter filter =
      new HyphenationCompoundWordTokenFilter(whitespaceMockTokenizer("Rindfleisch"), patterns);

  // TODO Rindfleisch returned twice is another issue of the HyphenationCompoundTokenFilter
  final String[] expectedTerms = { "Rindfleisch", "Rind", "Rindfleisch", "fleisch" };
  assertTokenStreamContents(filter, expectedTerms);
}

Example #4

Source File: TestCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Hyphenation without a dictionary can emit many nonsense tokens; the
 * minimum and maximum subword sizes are the knobs that rein this in.
 * The same input is decompounded three times with different size windows.
 */
public void testHyphenationOnly() throws Exception {
  final String patternsLocation = getClass().getResource("da_UTF8.xml").toExternalForm();
  final HyphenationTree patterns =
      HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(patternsLocation));

  // min=2, max=4
  final String[] expectedFor2To4 = { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" };
  assertTokenStreamContents(newHyphenationOnlyFilter(patterns, 2, 4), expectedFor2To4);

  // min=4, max=6
  final String[] expectedFor4To6 = {
      "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv"
  };
  assertTokenStreamContents(newHyphenationOnlyFilter(patterns, 4, 6), expectedFor4To6);

  // min=4, max=10
  final String[] expectedFor4To10 = {
      "basketballkurv", "basket", "basketbal", "basketball", "sket",
      "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv"
  };
  assertTokenStreamContents(newHyphenationOnlyFilter(patterns, 4, 10), expectedFor4To10);
}

/**
 * Builds a dictionary-less filter over a fresh tokenizer for "basketballkurv",
 * using the default minimum word size and the given subword size window.
 */
private HyphenationCompoundWordTokenFilter newHyphenationOnlyFilter(
    HyphenationTree hyphenator, int minSubwordSize, int maxSubwordSize) throws Exception {
  return new HyphenationCompoundWordTokenFilter(
      whitespaceMockTokenizer("basketballkurv"),
      hyphenator,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      minSubwordSize,
      maxSubwordSize);
}

Example #5

Source File: HyphenationCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

3 votes

/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} with every setting
 * given explicitly. All arguments except {@code hyphenator} are handed to the
 * superclass constructor; the hyphenator is kept on this instance.
 *
 * @param input            the {@link org.apache.lucene.analysis.TokenStream} to process
 * @param hyphenator       the hyphenation pattern tree to use for hyphenation
 * @param dictionary       the word dictionary to match against
 * @param minWordSize      only words longer than this get processed
 * @param minSubwordSize   only subwords longer than this get to the output stream
 * @param maxSubwordSize   only subwords shorter than this get to the output stream
 * @param onlyLongestMatch add only the longest matching subword to the stream
 */
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    CharArraySet dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  this.hyphenator = hyphenator;
}

Example #6

Source File: HyphenationCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

3 votes

/**
 * Creates a {@link HyphenationCompoundWordTokenFilter} that uses no dictionary.
 * <p>
 * Delegates to
 * {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, org.apache.lucene.analysis.CharArraySet, int, int, int, boolean)
 * HyphenationCompoundWordTokenFilter(input, hyphenator, null, minWordSize,
 * minSubwordSize, maxSubwordSize, false)}.
 *
 * @param input          the token stream to process
 * @param hyphenator     the hyphenation pattern tree to use for hyphenation
 * @param minWordSize    forwarded unchanged as {@code minWordSize}
 * @param minSubwordSize forwarded unchanged as {@code minSubwordSize}
 * @param maxSubwordSize forwarded unchanged as {@code maxSubwordSize}
 */
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize) {
  // null dictionary, onlyLongestMatch = false
  this(input, hyphenator, null, minWordSize, minSubwordSize, maxSubwordSize, false);
}

Example #7

Source File: HyphenationCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

3 votes

/**
 * Builds a {@link HyphenationTree} and loads the hyphenation patterns from
 * the given source into it.
 *
 * @param hyphenationSource the InputSource pointing to the XML grammar
 * @return the newly created tree after its patterns have been loaded
 * @throws java.io.IOException If there is a low-level I/O error.
 */
public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
    throws IOException {
  final HyphenationTree patterns = new HyphenationTree();
  patterns.loadPatterns(hyphenationSource);
  return patterns;
}

Example #8

Source File: HyphenationCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} that uses the
 * default word and subword size limits and does not restrict output to the
 * longest match.
 *
 * @param input      the {@link org.apache.lucene.analysis.TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against
 */
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    CharArraySet dictionary) {
  this(
      input,
      hyphenator,
      dictionary,
      DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE,
      DEFAULT_MAX_SUBWORD_SIZE,
      false);
}

Example #9

Source File: HyphenationCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Creates a {@link HyphenationCompoundWordTokenFilter} that uses no dictionary
 * and the default word and subword size limits.
 * <p>
 * Delegates to
 * {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, int, int, int)
 * HyphenationCompoundWordTokenFilter(input, hyphenator, DEFAULT_MIN_WORD_SIZE,
 * DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}.
 *
 * @param input      the token stream to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 */
public HyphenationCompoundWordTokenFilter(TokenStream input, HyphenationTree hyphenator) {
  this(
      input,
      hyphenator,
      DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE,
      DEFAULT_MAX_SUBWORD_SIZE);
}

Example #10

Source File: HyphenationCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Creates a hyphenator tree from a file name. The name is wrapped in an
 * {@link InputSource} and passed to {@link #getHyphenationTree(InputSource)}.
 *
 * @param hyphenationFilename the filename of the XML grammar to load
 * @return An object representing the hyphenation patterns
 * @throws java.io.IOException If there is a low-level I/O error.
 */
public static HyphenationTree getHyphenationTree(String hyphenationFilename)
    throws IOException {
  final InputSource grammarSource = new InputSource(hyphenationFilename);
  return getHyphenationTree(grammarSource);
}