org.apache.lucene.util.AttributeFactory Java Examples

The following examples show how to use org.apache.lucene.util.AttributeFactory. They are drawn from open source projects; the source file, project, and license are noted above each example.
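Most of the examples follow the same pattern: pass an AttributeFactory to a Tokenizer constructor, register the attributes you need, then iterate the stream. A minimal, self-contained sketch of that pattern (the input text is arbitrary):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class AttributeFactoryDemo {
    public static void main(String[] args) throws Exception {
        // DEFAULT_ATTRIBUTE_FACTORY creates the standard *Impl class for each attribute interface.
        AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
        try (Tokenizer tokenizer = new WhitespaceTokenizer(factory)) {
            tokenizer.setReader(new StringReader("hello attribute factory"));
            CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            tokenizer.end();
        }
    }
}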
Example #1
Source File: TestConcatenatingTokenStream.java    From lucene-solr with Apache License 2.0
public void testInconsistentAttributes() throws IOException {

    AttributeFactory factory = newAttributeFactory();

    final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    first.setReader(new StringReader("first words "));
    first.addAttribute(PayloadAttribute.class);
    final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    second.setReader(new StringReader("second words"));
    second.addAttribute(FlagsAttribute.class);

    TokenStream ts = new ConcatenatingTokenStream(first, second);
    assertTrue(ts.hasAttribute(FlagsAttribute.class));
    assertTrue(ts.hasAttribute(PayloadAttribute.class));

    assertTokenStreamContents(ts,
        new String[] { "first", "words", "second", "words" },
        new int[]{ 0, 6, 12, 19, },
        new int[]{ 5, 11, 18, 24, });

  }
 
Example #2
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testHugeTerm2() throws Exception {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 40960; i++) {
        sb.append('a');
    }
    String input = sb.toString();
    IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
            new DefaultIcuTokenizerConfig(false, true));
    tokenizer.setReader(new StringReader(input));
char[] token = new char[4096];
    Arrays.fill(token, 'a');
    String expectedToken = new String(token);
String[] expected = {
            expectedToken, expectedToken, expectedToken,
            expectedToken, expectedToken, expectedToken,
            expectedToken, expectedToken, expectedToken,
            expectedToken
    };
    assertTokenStreamContents(tokenizer, expected);
}
 
Example #3
Source File: QueryParserImpl.java    From AdSearch_Endpoints with Apache License 2.0
@Override
public List<String> parseQuery(String queryStr) {
    // Tokenize queryStr and remove English stop words. (No stemming is applied here.)
    List<String> tokens = new ArrayList<>();
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    Tokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
    // The filter shares its AttributeSource with the tokenizer, so read the
    // term attribute from the outermost stream.
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokens.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close(); // also closes the wrapped tokenizer
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokens;
}
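A hypothetical call site (the QueryParserImpl constructor is assumed from the surrounding project):

List<String> terms = new QueryParserImpl().parseQuery("the quick brown fox");
// "the" is dropped by the stop filter: [quick, brown, fox]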
 
Example #4
Source File: MMSegTokenizerFactory.java    From mmseg4j-solr with Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
    // Reuse a per-thread tokenizer; note the supplied AttributeFactory is not used.
    MMSegTokenizer tokenizer = tokenizerLocal.get();
    if (tokenizer == null) {
        tokenizer = newTokenizer();
    }
    return tokenizer;
}
 
Example #5
Source File: NumericTokenizer.java    From Elasticsearch with Apache License 2.0
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
    return new AttributeFactory() {
        @Override
        public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
            return (AttributeImpl) source.addAttribute(attClass);
        }
    };
}
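Because createAttributeInstance calls source.addAttribute, every attribute a consumer requests through this factory is the delegate's own instance, so the two stay in sync for free. A standalone sketch of the same idea (all classes are from org.apache.lucene.util and the analysis module; the wiring itself is illustrative):

final AttributeSource source = new AttributeSource();
AttributeFactory sharing = new AttributeFactory() {
    @Override
    public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
        // Hand back the delegate's instance instead of creating a fresh one.
        return (AttributeImpl) source.addAttribute(attClass);
    }
};
AttributeSource consumer = new AttributeSource(sharing);
CharTermAttribute fromSource = source.addAttribute(CharTermAttribute.class);
CharTermAttribute fromConsumer = consumer.addAttribute(CharTermAttribute.class);
System.out.println(fromSource == fromConsumer); // true: one shared instance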
 
Example #6
Source File: PathHierarchyTokenizerFactory.java    From lucene-solr with Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
  if (reverse) {
    return new ReversePathHierarchyTokenizer(factory, delimiter, replacement, skip);
  }
  return new PathHierarchyTokenizer(factory, delimiter, replacement, skip);
}
 
Example #7
Source File: IcuTokenizerCJKTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private static Analyzer create() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(true, true)));
        }
    };
}
 
Example #8
Source File: OpenNLPTokenizer.java    From jate with GNU Lesser General Public License v3.0
public OpenNLPTokenizer(AttributeFactory factory, SentenceDetector sentenceOp,
                        opennlp.tools.tokenize.Tokenizer tokenizerOp,
                        ParagraphChunker paragraphOp) {
    super(factory);
    termAtt.resizeBuffer(DEFAULT_BUFFER_SIZE);
    if (sentenceOp == null && tokenizerOp == null) {
        throw new IllegalArgumentException("OpenNLPTokenizer: need one or both of Sentence Detector and Tokenizer");
    }
    this.sentenceOp = sentenceOp;
    this.tokenizerOp = tokenizerOp;
    this.paragraphOp = paragraphOp;
}
 
Example #9
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenFilter filter = new IcuNormalizerFilter(tokenizer,
                    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
 
Example #10
Source File: MtasTokenizerFactory.java    From mtas with Apache License 2.0
@Override
public MtasTokenizer create(AttributeFactory factory) {
  MtasTokenizer tokenizer = null;
  try {
    tokenizer = create(factory, null);
  } catch (IOException e) {
    log.error(e);
  }
  return tokenizer;
}
 
Example #11
Source File: JapaneseTokenizer.java    From lucene-solr with Apache License 2.0
/**
 * Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
 *
 * @param factory the AttributeFactory to use
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer
    (AttributeFactory factory, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
  this(factory,
       TokenInfoDictionary.getInstance(),
       UnknownDictionary.getInstance(),
       ConnectionCosts.getInstance(),
       userDictionary, discardPunctuation, true, mode);
}
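A hedged construction sketch using this constructor (Mode.SEARCH and the null user dictionary are illustrative choices):

JapaneseTokenizer tokenizer = new JapaneseTokenizer(
    AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
    null,                           // no user dictionary
    true,                           // discard punctuation tokens
    JapaneseTokenizer.Mode.SEARCH); // decompound long tokens for search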
 
Example #12
Source File: MMSegTokenizerFactory.java    From jstarcraft-nlp with Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
    MmsegTokenizer tokenizer = tokenizerLocal.get();
    if (tokenizer == null) {
        tokenizer = newTokenizer();
    }

    return tokenizer;
}
 
Example #13
Source File: JapaneseTokenizerFactory.java    From lucene-solr with Apache License 2.0
@Override
public JapaneseTokenizer create(AttributeFactory factory) {
  JapaneseTokenizer t = new JapaneseTokenizer(factory, userDictionary, discardPunctuation, discardCompoundToken, mode);
  if (nbestExamples != null) {
    nbestCost = Math.max(nbestCost, t.calcNBestCost(nbestExamples));
  }
  t.setNBestCost(nbestCost);
  return t;
}
 
Example #14
Source File: OpenNLPTokenizerFactory.java    From lucene-solr with Apache License 2.0
@Override
public OpenNLPTokenizer create(AttributeFactory factory) {
  try {
    NLPSentenceDetectorOp sentenceOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
    NLPTokenizerOp tokenizerOp = OpenNLPOpsFactory.getTokenizer(tokenizerModelFile);
    return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
 
Example #15
Source File: WhitespaceTokenizerFactory.java    From lucene-solr with Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
  switch (rule) {
    case RULE_JAVA:
      return new WhitespaceTokenizer(factory, maxTokenLen);
    case RULE_UNICODE:
      return new UnicodeWhitespaceTokenizer(factory, maxTokenLen);
    default:
      throw new AssertionError();
  }
}
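The rule normally arrives as a factory argument; a hedged sketch of driving this switch directly ("rule" and "maxTokenLen" follow the factory's documented parameter names):

Map<String, String> args = new HashMap<>();
args.put("rule", "unicode");    // "java" selects WhitespaceTokenizer, "unicode" the Unicode-aware variant
args.put("maxTokenLen", "255");
Tokenizer tokenizer = new WhitespaceTokenizerFactory(args)
    .create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);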
 
Example #16
Source File: ThaiTokenizer.java    From lucene-solr with Apache License 2.0
/** Creates a new ThaiTokenizer, supplying the AttributeFactory */
public ThaiTokenizer(AttributeFactory factory) {
  super(factory, (BreakIterator)sentenceProto.clone());
  if (!DBBI_AVAILABLE) {
    throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
  }
  wordBreaker = (BreakIterator)proto.clone();
}
 
Example #17
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testHugeDoc() throws Exception {
    StringBuilder sb = new StringBuilder();
    char[] whitespace = new char[4094];
    Arrays.fill(whitespace, ' ');
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
            new DefaultIcuTokenizerConfig(false, true));
    tokenizer.setReader(new StringReader(input));
    assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
 
Example #18
Source File: SimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
/** Runs a pre-built automaton. */
public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
  super(factory);

  // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
  // realizing this ctor is otherwise trappy
  if (dfa.isDeterministic() == false) {
    throw new IllegalArgumentException("please determinize the incoming automaton first");
  }

  runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
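As the comment warns, determinization is the caller's job; a hedged sketch of preparing the automaton (the regex is arbitrary; RegExp, Automaton, and Operations live in org.apache.lucene.util.automaton):

Automaton a = new RegExp("[a-zA-Z]+").toAutomaton();
Automaton dfa = Operations.determinize(a, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
Tokenizer tokenizer = new SimplePatternTokenizer(
    AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, dfa);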
 
Example #19
Source File: Test2BTerms.java    From lucene-solr with Apache License 2.0
public MyTokenStream(Random random, int tokensPerDoc) {
  super(new MyAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
  this.tokensPerDoc = tokensPerDoc;
  addAttribute(TermToBytesRefAttribute.class);
  bytes.length = TOKEN_LEN;
  this.random = random;
  nextSave = TestUtil.nextInt(random, 500000, 1000000);
}
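Lucene also provides a helper for the common wrap-one-attribute case, which is how factories like the one in Example #30 below are typically built; a hedged sketch (MyTermAttributeImpl is a hypothetical AttributeImpl subclass):

AttributeFactory wrapped = AttributeFactory.getStaticImplementation(
    AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, MyTermAttributeImpl.class);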
 
Example #20
Source File: TestConcatenatingTokenStream.java    From lucene-solr with Apache License 2.0
public void testBasic() throws IOException {

    AttributeFactory factory = newAttributeFactory();

    final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    first.setReader(new StringReader("first words "));
    final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    second.setReader(new StringReader("second words"));
    final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    third.setReader(new StringReader(" third words"));

    TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third);
    assertTokenStreamContents(ts,
        new String[] { "first", "words", "second", "words", "third", "words" },
        new int[]{ 0, 6, 12, 19, 25, 31 },
        new int[]{ 5, 11, 18, 24, 30, 36 });

    // test re-use
    first.setReader(new StringReader("first words "));
    second.setReader(new StringReader("second words"));
    third.setReader(new StringReader(" third words"));
    assertTokenStreamContents(ts,
        new String[] { "first", "words", "second", "words", "third", "words" },
        new int[]{ 0, 6, 12, 19, 25, 31 },
        new int[]{ 5, 11, 18, 24, 30, 36 },
        new int[]{ 1, 1, 1, 1, 1, 1 });

  }
 
Example #21
Source File: SimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0
/** Runs a pre-built automaton. */
public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
  super(factory);

  // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
  // realizing this ctor is otherwise trappy
  if (dfa.isDeterministic() == false) {
    throw new IllegalArgumentException("please determinize the incoming automaton first");
  }

  runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
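Note the difference from SimplePatternTokenizer in Example #18: there the automaton's matches become the tokens, while SimplePatternSplitTokenizer treats matches as separators and emits the text between them.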
 
Example #22
Source File: KoreanTokenizer.java    From lucene-solr with Apache License 2.0
/**
 * <p>Create a new KoreanTokenizer supplying a custom system dictionary and unknown dictionary.
 * This constructor provides an entry point for users that want to construct custom language models
 * that can be used as input to {@link org.apache.lucene.analysis.ko.util.DictionaryBuilder}.</p>
 *
 * @param factory the AttributeFactory to use
 * @param systemDictionary a custom known token dictionary
 * @param unkDictionary a custom unknown token dictionary
 * @param connectionCosts custom token transition costs
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param mode Decompound mode.
 * @param outputUnknownUnigrams if true outputs unigrams for unknown words.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @lucene.experimental
 */
public KoreanTokenizer(AttributeFactory factory,
                       TokenInfoDictionary systemDictionary,
                       UnknownDictionary unkDictionary,
                       ConnectionCosts connectionCosts,
                       UserDictionary userDictionary,
                       DecompoundMode mode,
                       boolean outputUnknownUnigrams,
                       boolean discardPunctuation) {
  super(factory);
  this.dictionary = systemDictionary;
  this.fst = dictionary.getFST();
  this.unkDictionary = unkDictionary;
  this.characterDefinition = unkDictionary.getCharacterDefinition();
  this.costs = connectionCosts;
  this.userDictionary = userDictionary;
  fstReader = fst.getBytesReader();
  if (userDictionary != null) {
    userFST = userDictionary.getFST();
    userFSTReader = userFST.getBytesReader();
  } else {
    userFST = null;
    userFSTReader = null;
  }
  this.mode = mode;
  this.outputUnknownUnigrams = outputUnknownUnigrams;
  this.discardPunctuation = discardPunctuation;
  buffer.reset(this.input);

  resetState();

  dictionaryMap.put(Type.KNOWN, dictionary);
  dictionaryMap.put(Type.UNKNOWN, unkDictionary);
  dictionaryMap.put(Type.USER, userDictionary);
}
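A hedged sketch of invoking this constructor with the dictionaries bundled in the nori module, standing in for the custom ones the javadoc describes:

KoreanTokenizer tokenizer = new KoreanTokenizer(
    AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
    TokenInfoDictionary.getInstance(),      // bundled system dictionary
    UnknownDictionary.getInstance(),
    ConnectionCosts.getInstance(),
    null,                                   // no user dictionary
    KoreanTokenizer.DecompoundMode.DISCARD, // drop the compound form, keep its parts
    false,                                  // no unigrams for unknown words
    true);                                  // discard punctuation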
 
Example #23
Source File: CJKBigramFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
@Before
public void up() {
    analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenStream result = new CJKBigramFilter(source);
            return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
        }
    };
}
 
Example #24
Source File: OpenNLPTokenizer.java    From lucene-solr with Apache License 2.0
public OpenNLPTokenizer(AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) throws IOException {
  super(factory, new OpenNLPSentenceBreakIterator(sentenceOp));
  if (sentenceOp == null || tokenizerOp == null) {
    throw new IllegalArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
  }
  this.sentenceOp = sentenceOp;
  this.tokenizerOp = tokenizerOp;
}
 
Example #25
Source File: NGramTokenizer.java    From lucene-solr with Apache License 2.0
NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
  super(factory);
  init(minGram, maxGram, edgesOnly);
}
 
Example #26
Source File: ClassicTokenizer.java    From lucene-solr with Apache License 2.0
/**
 * Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeFactory} 
 */
public ClassicTokenizer(AttributeFactory factory) {
  super(factory);
  init();
}
 
Example #27
Source File: FieldAnalysisRequestHandlerTest.java    From lucene-solr with Apache License 2.0
public CustomTokenizer(AttributeFactory factory) {
  super(factory);
  addAttributeImpl(new CustomFlagsAttributeImpl());
  charAtt = addAttribute(CharTermAttribute.class);
  customAtt = addAttribute(FlagsAttribute.class);
}
 
Example #28
Source File: ExactTokenizerFactory.java    From crushpaper with GNU Affero General Public License v3.0
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
    return new ExactTokenizer(factory, input);
}
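The extra Reader parameter marks this as the pre-Lucene 5 TokenizerFactory API; in the other examples here the reader is supplied afterwards via setReader.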
 
Example #29
Source File: WikipediaTokenizerFactory.java    From lucene-solr with Apache License 2.0
@Override
public WikipediaTokenizer create(AttributeFactory factory) {
  return new WikipediaTokenizer(factory, tokenOutput, untokenizedTypes);
}
 
Example #30
Source File: MockBytesAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected AttributeFactory attributeFactory(String fieldName) {
  return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY;
}