Java Code Examples for org.apache.lucene.util.AttributeFactory

The following examples show how to use org.apache.lucene.util.AttributeFactory. They are extracted from open source projects; the source project, source file, and license are noted above each example where available.
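Most examples follow the same basic pattern: obtain an AttributeFactory, hand it to a Tokenizer, and read terms through a CharTermAttribute. As a quick orientation, here is a minimal sketch of that pattern (assuming lucene-core is on the classpath; WhitespaceTokenizer is from org.apache.lucene.analysis.core, CharTermAttribute from org.apache.lucene.analysis.tokenattributes):

static List<String> tokenize(String text) throws IOException {
    // DEFAULT_ATTRIBUTE_FACTORY creates the standard attribute implementations.
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    Tokenizer tokenizer = new WhitespaceTokenizer(factory);
    tokenizer.setReader(new StringReader(text));
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<>();
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        tokens.add(termAtt.toString());
    }
    tokenizer.end();
    tokenizer.close();
    return tokens;
}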
Example 1
Source Project: AdSearch_Endpoints   Source File: QueryParserImpl.java    License: Apache License 2.0
@Override
public List<String> parseQuery(String queryStr) {
    // Tokenize queryStr and remove stop words (no stemming is applied here).
    List<String> tokens = new ArrayList<String>();
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    Tokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
    // The filter shares the tokenizer's AttributeSource, so the term
    // attribute can be read from either.
    CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokens.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close(); // also closes the wrapped tokenizer
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokens;
}
 
Example 2
public void testHugeTerm2() throws Exception {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 40960; i++) {
        sb.append('a');
    }
    String input = sb.toString();
    IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
            new DefaultIcuTokenizerConfig(false, true));
    tokenizer.setReader(new StringReader(input));
    char[] token = new char[4096];
    Arrays.fill(token, 'a');
    String expectedToken = new String(token);
    String[] expected = {
            expectedToken, expectedToken, expectedToken,
            expectedToken, expectedToken, expectedToken,
            expectedToken, expectedToken, expectedToken,
            expectedToken
    };
    assertTokenStreamContents(tokenizer, expected);
}
 
Example 3
Source Project: lucene-solr   Source File: TestConcatenatingTokenStream.java    License: Apache License 2.0
public void testInconsistentAttributes() throws IOException {

    AttributeFactory factory = newAttributeFactory();

    final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    first.setReader(new StringReader("first words "));
    first.addAttribute(PayloadAttribute.class);
    final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    second.setReader(new StringReader("second words"));
    second.addAttribute(FlagsAttribute.class);

    TokenStream ts = new ConcatenatingTokenStream(first, second);
    assertTrue(ts.hasAttribute(FlagsAttribute.class));
    assertTrue(ts.hasAttribute(PayloadAttribute.class));

    assertTokenStreamContents(ts,
        new String[] { "first", "words", "second", "words" },
        new int[]{ 0, 6, 12, 19, },
        new int[]{ 5, 11, 18, 24, });

  }
 
Example 4
Source Project: jstarcraft-nlp   Source File: MMSegTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
    MmsegTokenizer tokenizer = tokenizerLocal.get();
    if (tokenizer == null) {
        tokenizer = newTokenizer();
    }

    return tokenizer;
}
 
Example 5
Source Project: lucene-solr   Source File: PathHierarchyTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
  if (reverse) {
    return new ReversePathHierarchyTokenizer(factory, delimiter, replacement, skip);
  }
  return new PathHierarchyTokenizer(factory, delimiter, replacement, skip);
}
 
Example 6
private static Analyzer create() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(true, true)));
        }
    };
}
 
Example 7
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenFilter filter = new IcuNormalizerFilter(tokenizer,
                    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
 
Example 8
Source Project: Elasticsearch   Source File: NumericTokenizer.java    License: Apache License 2.0
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
    return new AttributeFactory() {
        @Override
        public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
            return (AttributeImpl) source.addAttribute(attClass);
        }
    };
}
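A hedged sketch of how such a delegating factory might be used (WhitespaceTokenizer stands in here for the actual numeric tokenizer): attributes added on the inner tokenizer are created by, and therefore shared with, the outer AttributeSource.

// Hypothetical usage sketch of delegatingAttributeFactory(...):
AttributeSource outer = new AttributeSource();
AttributeFactory delegating = delegatingAttributeFactory(outer);
Tokenizer inner = new WhitespaceTokenizer(delegating);
CharTermAttribute term = inner.addAttribute(CharTermAttribute.class);
// 'outer' now exposes the very same attribute instance:
assert outer.getAttribute(CharTermAttribute.class) == term;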
 
Example 9
Source Project: jate   Source File: OpenNLPTokenizer.java    License: GNU Lesser General Public License v3.0
public OpenNLPTokenizer(AttributeFactory factory, SentenceDetector sentenceOp,
                        opennlp.tools.tokenize.Tokenizer tokenizerOp,
                        ParagraphChunker paragraphOp) {
    super(factory);
    termAtt.resizeBuffer(DEFAULT_BUFFER_SIZE);
    if (sentenceOp == null && tokenizerOp == null) {
        throw new IllegalArgumentException("OpenNLPTokenizer: need one or both of Sentence Detector and Tokenizer");
    }
    this.sentenceOp = sentenceOp;
    this.tokenizerOp = tokenizerOp;
    this.paragraphOp = paragraphOp;
}
 
Example 10
Source Project: mtas   Source File: MtasTokenizerFactory.java    License: Apache License 2.0
@Override
public MtasTokenizer create(AttributeFactory factory) {
  MtasTokenizer tokenizer = null;
  try {
    tokenizer = create(factory, null);
  } catch (IOException e) {
    log.error(e);
  }
  return tokenizer;
}
 
Example 11
Source Project: lucene-solr   Source File: JapaneseTokenizer.java    License: Apache License 2.0
/**
 * Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
 *
 * @param factory the AttributeFactory to use
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer
    (AttributeFactory factory, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
  this(factory,
       TokenInfoDictionary.getInstance(),
       UnknownDictionary.getInstance(),
       ConnectionCosts.getInstance(),
       userDictionary, discardPunctuation, true, mode);
}
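A hedged usage sketch for this constructor, with the default attribute factory, no user dictionary, and NORMAL mode:

// Sketch: punctuation tokens are discarded (third argument is true).
Tokenizer t = new JapaneseTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
    null, true, JapaneseTokenizer.Mode.NORMAL);
t.setReader(new StringReader("日本語のテキスト"));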
 
Example 12
Source Project: mmseg4j-solr   Source File: MMSegTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
    MMSegTokenizer tokenizer = tokenizerLocal.get();
    if (tokenizer == null) {
        tokenizer = newTokenizer();
    }

    return tokenizer;
}
 
Example 13
Source Project: lucene-solr   Source File: JapaneseTokenizerFactory.java    License: Apache License 2.0
@Override
public JapaneseTokenizer create(AttributeFactory factory) {
  JapaneseTokenizer t = new JapaneseTokenizer(factory, userDictionary, discardPunctuation, discardCompoundToken, mode);
  if (nbestExamples != null) {
    nbestCost = Math.max(nbestCost, t.calcNBestCost(nbestExamples));
  }
  t.setNBestCost(nbestCost);
  return t;
}
 
Example 14
Source Project: lucene-solr   Source File: OpenNLPTokenizer.java    License: Apache License 2.0
public OpenNLPTokenizer(AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) throws IOException {
  super(factory, new OpenNLPSentenceBreakIterator(sentenceOp));
  if (sentenceOp == null || tokenizerOp == null) {
    throw new IllegalArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
  }
  this.sentenceOp = sentenceOp;
  this.tokenizerOp = tokenizerOp;
}
 
Example 15
Source Project: lucene-solr   Source File: OpenNLPTokenizerFactory.java    License: Apache License 2.0
@Override
public OpenNLPTokenizer create(AttributeFactory factory) {
  try {
    NLPSentenceDetectorOp sentenceOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
    NLPTokenizerOp tokenizerOp = OpenNLPOpsFactory.getTokenizer(tokenizerModelFile);
    return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
 
Example 16
@Before
public void up() {
    analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenStream result = new CJKBigramFilter(source);
            return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
        }
    };
}
 
Example 17
Source Project: lucene-solr   Source File: KoreanTokenizer.java    License: Apache License 2.0
/**
 * <p>Create a new KoreanTokenizer supplying a custom system dictionary and unknown dictionary.
 * This constructor provides an entry point for users that want to construct custom language models
 * that can be used as input to {@link org.apache.lucene.analysis.ko.util.DictionaryBuilder}.</p>
 *
 * @param factory the AttributeFactory to use
 * @param systemDictionary a custom known token dictionary
 * @param unkDictionary a custom unknown token dictionary
 * @param connectionCosts custom token transition costs
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param mode Decompound mode.
 * @param outputUnknownUnigrams if true outputs unigrams for unknown words.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @lucene.experimental
 */
public KoreanTokenizer(AttributeFactory factory,
                       TokenInfoDictionary systemDictionary,
                       UnknownDictionary unkDictionary,
                       ConnectionCosts connectionCosts,
                       UserDictionary userDictionary,
                       DecompoundMode mode,
                       boolean outputUnknownUnigrams,
                       boolean discardPunctuation) {
  super(factory);
  this.dictionary = systemDictionary;
  this.fst = dictionary.getFST();
  this.unkDictionary = unkDictionary;
  this.characterDefinition = unkDictionary.getCharacterDefinition();
  this.costs = connectionCosts;
  this.userDictionary = userDictionary;
  fstReader = fst.getBytesReader();
  if (userDictionary != null) {
    userFST = userDictionary.getFST();
    userFSTReader = userFST.getBytesReader();
  } else {
    userFST = null;
    userFSTReader = null;
  }
  this.mode = mode;
  this.outputUnknownUnigrams = outputUnknownUnigrams;
  this.discardPunctuation = discardPunctuation;
  buffer.reset(this.input);

  resetState();

  dictionaryMap.put(Type.KNOWN, dictionary);
  dictionaryMap.put(Type.UNKNOWN, unkDictionary);
  dictionaryMap.put(Type.USER, userDictionary);
}
 
Example 18
Source Project: lucene-solr   Source File: SimplePatternSplitTokenizer.java    License: Apache License 2.0
/** Runs a pre-built automaton. */
public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
  super(factory);

  // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
  // realizing this ctor is otherwise trappy
  if (dfa.isDeterministic() == false) {
    throw new IllegalArgumentException("please determinize the incoming automaton first");
  }

  runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
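Because the constructor rejects nondeterministic automata, callers are expected to determinize up front. A minimal sketch, assuming the automaton classes from org.apache.lucene.util.automaton (the same applies to SimplePatternTokenizer in Example 21):

// Build an automaton from a regular expression and determinize it first.
Automaton dfa = Operations.determinize(new RegExp(":+").toAutomaton(),
    Operations.DEFAULT_MAX_DETERMINIZED_STATES);
Tokenizer t = new SimplePatternSplitTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, dfa);
t.setReader(new StringReader("one:two::three")); // yields "one", "two", "three"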
 
Example 19
Source Project: lucene-solr   Source File: TestConcatenatingTokenStream.java    License: Apache License 2.0
public void testBasic() throws IOException {

    AttributeFactory factory = newAttributeFactory();

    final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    first.setReader(new StringReader("first words "));
    final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    second.setReader(new StringReader("second words"));
    final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    third.setReader(new StringReader(" third words"));

    TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third);
    assertTokenStreamContents(ts,
        new String[] { "first", "words", "second", "words", "third", "words" },
        new int[]{ 0, 6, 12, 19, 25, 31 },
        new int[]{ 5, 11, 18, 24, 30, 36 });

    // test re-use
    first.setReader(new StringReader("first words "));
    second.setReader(new StringReader("second words"));
    third.setReader(new StringReader(" third words"));
    assertTokenStreamContents(ts,
        new String[] { "first", "words", "second", "words", "third", "words" },
        new int[]{ 0, 6, 12, 19, 25, 31 },
        new int[]{ 5, 11, 18, 24, 30, 36 },
        new int[]{ 1, 1, 1, 1, 1, 1 });

  }
 
Example 20
Source Project: lucene-solr   Source File: Test2BTerms.java    License: Apache License 2.0
public MyTokenStream(Random random, int tokensPerDoc) {
  super(new MyAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
  this.tokensPerDoc = tokensPerDoc;
  addAttribute(TermToBytesRefAttribute.class);
  bytes.length = TOKEN_LEN;
  this.random = random;
  nextSave = TestUtil.nextInt(random, 500000, 1000000);
}
 
Example 21
Source Project: lucene-solr   Source File: SimplePatternTokenizer.java    License: Apache License 2.0
/** Runs a pre-built automaton. */
public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
  super(factory);

  // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
  // realizing this ctor is otherwise trappy
  if (dfa.isDeterministic() == false) {
    throw new IllegalArgumentException("please determinize the incoming automaton first");
  }

  runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
 
Example 22
public void testHugeDoc() throws Exception {
    StringBuilder sb = new StringBuilder();
    char[] whitespace = new char[4094];
    Arrays.fill(whitespace, ' ');
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
            new DefaultIcuTokenizerConfig(false, true));
    tokenizer.setReader(new StringReader(input));
    assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
 
Example 23
Source Project: lucene-solr   Source File: ThaiTokenizer.java    License: Apache License 2.0
/** Creates a new ThaiTokenizer, supplying the AttributeFactory */
public ThaiTokenizer(AttributeFactory factory) {
  super(factory, (BreakIterator)sentenceProto.clone());
  if (!DBBI_AVAILABLE) {
    throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
  }
  wordBreaker = (BreakIterator)proto.clone();
}
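A short usage sketch for this constructor (hedged; as the check above shows, it throws on JREs without dictionary-based Thai BreakIterator support):

Tokenizer t = new ThaiTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
t.setReader(new StringReader("ภาษาไทย"));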
 
Example 24
Source Project: lucene-solr   Source File: WhitespaceTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
  switch (rule) {
    case RULE_JAVA:
      return new WhitespaceTokenizer(factory, maxTokenLen);
    case RULE_UNICODE:
      return new UnicodeWhitespaceTokenizer(factory, maxTokenLen);
    default:
      throw new AssertionError();
  }
}
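A hedged sketch of configuring this factory through its args map; the parameter names "rule" and "maxTokenLen" are inferred from the fields used above:

Map<String, String> args = new HashMap<>();
args.put("rule", "unicode");    // selects UnicodeWhitespaceTokenizer
args.put("maxTokenLen", "255");
WhitespaceTokenizerFactory f = new WhitespaceTokenizerFactory(args);
Tokenizer t = f.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);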
 
Example 25
Source Project: jstarcraft-nlp   Source File: AnsjTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
    return AnsjAnalyzer.getTokenizer(null, getOriginalArgs());
}
 
Example 26
Source Project: jstarcraft-nlp   Source File: CoreNlpTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
    return new CoreNlpTokenizer(factory, pipeline);
}
 
Example 27
Source Project: jstarcraft-nlp   Source File: CoreNlpTokenizer.java    License: Apache License 2.0
public CoreNlpTokenizer(AttributeFactory factory, AnnotationPipeline pipeline) {
    super(factory);
    this.pipeline = pipeline;
}
 
Example 28
Source Project: jstarcraft-nlp   Source File: NlpSegmenter.java    License: Apache License 2.0
public NlpSegmenter(AttributeFactory factory, BreakIterator iterator, NlpTokenizer<? extends NlpToken> tokenizer) {
    super(factory, iterator);
    this.tokenizer = tokenizer;
}
 
Example 29
Source Project: jstarcraft-nlp   Source File: JiebaTokenizer.java    License: Apache License 2.0
/** Creates a new JiebaTokenizer, supplying the AttributeFactory */
public JiebaTokenizer(JiebaSegmenter.SegMode segMode, AttributeFactory factory) {
    super(factory, (BreakIterator) sentenceProto.clone());
    this.segMode = segMode;
}
 
Example 30
Source Project: lucene-solr   Source File: TokenStream.java    License: Apache License 2.0
/**
 * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
 */
protected TokenStream(AttributeFactory factory) {
  super(factory);
  assert assertFinal();
}
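To round out the constructor above, a minimal hypothetical subclass showing the factory being threaded through super(factory):

// Sketch: a one-token stream whose attributes come from the supplied factory.
final class SingleTokenStream extends TokenStream {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private boolean emitted = false;

    SingleTokenStream(AttributeFactory factory) {
        super(factory); // attribute instances are created by 'factory'
    }

    @Override
    public boolean incrementToken() {
        if (emitted) {
            return false;
        }
        clearAttributes();
        termAtt.append("token");
        emitted = true;
        return true;
    }
}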