Java Code Examples for org.apache.lucene.analysis.Analyzer#tokenStream()

The following examples show how to use org.apache.lucene.analysis.Analyzer#tokenStream(). Each example is taken from an open-source project; the source file and project are noted above it.
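Before the project examples, it may help to see the general consumption contract for the stream returned by Analyzer#tokenStream(): obtain the attributes you need, call reset(), iterate with incrementToken(), then call end() and close the stream. The sketch below is a minimal, self-contained illustration of that pattern; the field name "body", the input text, and the use of StandardAnalyzer are arbitrary placeholders standing in for any analyzer.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamBasics {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            // try-with-resources closes the stream, which the analyzer requires
            // before it will hand out another stream for the same field
            try (TokenStream stream = analyzer.tokenStream("body", "Hello token stream world")) {
                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
                stream.reset();                     // mandatory before the first incrementToken()
                while (stream.incrementToken()) {
                    System.out.println(term.toString());
                }
                stream.end();                       // records final offset/position state
            }
        }
    }
}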
Example 1
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testAfterStrSingleSynonymExpand2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("ab"));
  stream.reset();
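  // entries passed to assertTokenStream are term,startOffset,endOffset,posIncr;
  // a posIncr of 0 stacks the synonym ("aa") at the same position as "a"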
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abb"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bb,1,3,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcd"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/cd,2,4,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcde"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/cd,2,4,1/de,3,5,1");
}
 
Example 2
Source File: TestConditionalTokenFilter.java    From lucene-solr with Apache License 2.0
public void testReadaheadWithNoFiltering() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new ClassicTokenizer();
      TokenStream sink = new ConditionalTokenFilter(source, in -> new ShingleFilter(in, 2)) {
        @Override
        protected boolean shouldFilter() throws IOException {
          return true;
        }
      };
      return new TokenStreamComponents(source, sink);
    }
  };

  String input = "one two three four";

  try (TokenStream ts = analyzer.tokenStream("", input)) {
    assertTokenStreamContents(ts, new String[]{
        "one", "one two",
        "two", "two three",
        "three", "three four",
        "four"
    });
  }
}
 
Example 3
Source File: ChineseWordAnalyzerTest.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void test1() {
    try {
        Analyzer analyzer = new WordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[杨尚川, 是, apdplat, 应用级, 产品, 开发, 平台, 的, 作者]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[杨尚川, 是, apdplat, 应用, 级, 产品, 开发, 平台, 的, 作者]";
        }
        Assert.assertEquals(expResult, words.toString());
    } catch (IOException e) {
        Assert.fail("分词出错" + e.getMessage());
    }
}
 
Example 4
Source File: DefaultQueryBuilder.java    From modernmt with Apache License 2.0
private static void loadTerms(String fieldName, Sentence sentence, Analyzer analyzer, BooleanQuery output) {
    final int maxClauseCount = BooleanQuery.getMaxClauseCount();
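    // BooleanQuery throws TooManyClauses when a query exceeds this limit,
    // so the loop below stops adding terms before reaching it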
    String text = TokensOutputStream.serialize(sentence, false, true);

    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(fieldName, text);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        int count = 0;

        stream.reset();
        while (stream.incrementToken() && (count + 1) < maxClauseCount) {
            Term term = new Term(fieldName, charTermAttribute.toString());
            output.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            count++;
        }
    } catch (IOException e) {
        throw new Error("This should never happen", e);
    } finally {
        closeQuietly(stream);
    }
}
 
Example 5
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testAfterStrSingleSynonym2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("ab"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/b,1,2,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("abb"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/bb,1,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcd"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/bc,1,3,1/cd,2,4,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcde"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/bc,1,3,1/cd,2,4,1/de,3,5,1");
}
 
Example 6
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testNormalization() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "tr")
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .put("index.analysis.analyzer.myAnalyzer.decomposition", "canonical")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");
    TokenStream tsUpper = analyzer.tokenStream(null, "I W\u0049\u0307LL USE TURKİSH CASING");
    BytesRef b1 = bytesFromTokenStream(tsUpper);
    TokenStream tsLower = analyzer.tokenStream(null, "ı will use turkish casıng");
    BytesRef b2 = bytesFromTokenStream(tsLower);
    assertTrue(compare(b1.bytes, b2.bytes) == 0);
}
 
Example 7
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSingleSynonym() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1, false, "a,aa,aaa");
  TokenStream stream = a.tokenStream("f", new StringReader("a"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, false, "a,aa,aaa");
  stream = a.tokenStream("f", new StringReader("aa"));
  stream.reset();
  assertTokenStream(stream, "aa,0,2,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, false, "a,aa,aaa");
  stream = a.tokenStream("f", new StringReader("aaa"));
  stream.reset();
  assertTokenStream(stream, "aaa,0,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, false, "a");
  stream = a.tokenStream("f", new StringReader("a"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1");
}
 
Example 8
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testIgnoreWhitespace() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "en")
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .put("index.analysis.analyzer.myAnalyzer.alternate", "shifted")
            .put("index.analysis.analyzer.myAnalyzer.variableTop", 4096) // SPACE
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");
    TokenStream tsWithoutSpace = analyzer.tokenStream(null, "foobar");
    BytesRef b1 = bytesFromTokenStream(tsWithoutSpace);
    TokenStream tsWithSpace = analyzer.tokenStream(null, "foo bar");
    BytesRef b2 = bytesFromTokenStream(tsWithSpace);
    assertTrue(compare(b1.bytes, b2.bytes) == 0);

    // now check that punctuation still matters: foo-bar < foo bar
    TokenStream tsWithPunctuation = analyzer.tokenStream(null, "foo-bar");
    BytesRef b3 = bytesFromTokenStream(tsWithPunctuation);
    assertTrue(compare(b3.bytes, b1.bytes) < 0);
}
 
Example 9
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testComplex2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa/b,bb");
  TokenStream stream = a.tokenStream("f", new StringReader("cabca"));
  stream.reset();
  assertTokenStream(stream, "c,0,1,1/a,1,2,1/b,2,3,1/c,3,4,1/a,4,5,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa/b,bb");
  stream = a.tokenStream("f", new StringReader("ccabcca"));
  stream.reset();
  assertTokenStream(stream, "cc,0,2,1/a,2,3,1/b,3,4,1/cc,4,6,1/a,6,7,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa/b,bb");
  stream = a.tokenStream("f", new StringReader("edcabcdea"));
  stream.reset();
  assertTokenStream(stream, "ed,0,2,1/dc,1,3,1/a,3,4,1/b,4,5,1/cd,5,7,1/de,6,8,1/a");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa/b,bb");
  stream = a.tokenStream("f", new StringReader("fedcabcdefa"));
  stream.reset();
  assertTokenStream(stream, "fe,0,2,1/ed,1,3,1/dc,2,4,1/a,4,5,1/b,5,6,1/cd,6,8,1/de,7,9,1/ef,8,10,1/a,10,11,1");
}
 
Example 10
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSandwichStrExpand1() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("aba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/a,2,3,1/aa,2,3,0");

  a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/b,2,3,1/a,3,4,1/aa,3,4,0");

  a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcda"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/c,2,3,1/d,3,4,1/a,4,5,1/aa,4,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdea"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/c,2,3,1/d,3,4,1/e,4,5,1/a,5,6,1/aa,5,6,0");
}
 
Example 11
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testPrevStrSingleSynonymExpand4() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("ba"));
  stream.reset();
  assertTokenStream(stream, "b,0,1,1/a,1,2,1/aa,1,2,0");

  a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("bba"));
  stream.reset();
  assertTokenStream(stream, "bb,0,2,1/b,1,2,0/a,2,3,1/aa,2,3,0");

  a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("dcba"));
  stream.reset();
  assertTokenStream(stream, "dcb,0,3,1/cb,1,3,0/b,2,3,0/a,3,4,1/aa,3,4,0");

  a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("edcba"));
  stream.reset();
  assertTokenStream(stream, "edcb,0,4,1/dcb,1,4,0/cb,2,4,0/b,3,4,0/a,4,5,1/aa,4,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("fedcba"));
  stream.reset();
  assertTokenStream(stream, "fedc,0,4,1/edcb,1,5,1/dcb,2,5,0/cb,3,5,0/b,4,5,0/a,5,6,1/aa,5,6,0");
}
 
Example 12
Source File: TokenCountFieldMapper.java    From Elasticsearch with Apache License 2.0
/**
 * Count position increments in a token stream.  Package private for testing.
 * @param analyzer analyzer to create token stream
 * @param fieldName field name to pass to analyzer
 * @param fieldValue field value to pass to analyzer
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
        int count = 0;
        PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            count += position.getPositionIncrement();
        }
        tokenStream.end();
        count += position.getPositionIncrement();
        return count;
    }
}
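A hypothetical call site for countPositions (the analyzer, field name and text below are illustrative only, not taken from the Elasticsearch source):

Analyzer analyzer = new StandardAnalyzer();
int positionCount = countPositions(analyzer, "body", "one two three");
// StandardAnalyzer emits three tokens with a position increment of 1 each,
// so positionCount should come out as 3
analyzer.close();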
 
Example 13
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testComplexExpand4() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa/b,bb");
  TokenStream stream = a.tokenStream("f", new StringReader("cabca"));
  stream.reset();
  assertTokenStream(stream, "c,0,1,1/a,1,2,1/aa,1,2,0/b,2,3,1/bb,2,3,0/c,3,4,1/a,4,5,1/aa,4,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa/b,bb");
  stream = a.tokenStream("f", new StringReader("ccabcca"));
  stream.reset();
  assertTokenStream(stream, "cc,0,2,1/c,1,2,0/a,2,3,1/aa,2,3,0/b,3,4,1/bb,3,4,0/c,4,5,1/cc,4,6,0/c,5,6,0/a,6,7,1/aa,6,7,0");

  a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa/b,bb");
  stream = a.tokenStream("f", new StringReader("edcabcdea"));
  stream.reset();
  assertTokenStream(stream, "edc,0,3,1/dc,1,3,0/c,2,3,0/a,3,4,1/aa,3,4,0/b,4,5,1/bb,4,5,0/c,5,6,1/cd,5,7,0/cde,5,8,0/de,6,8,0/e,7,8,0/a,8,9,1/aa,8,9,0");

  a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa/b,bb");
  stream = a.tokenStream("f", new StringReader("fedcabcdefa"));
  stream.reset();
  assertTokenStream(stream, "fedc,0,4,1/edc,1,4,0/dc,2,4,0/c,3,4,0/a,4,5,1/aa,4,5,0/b,5,6,1/bb,5,6,0/c,6,7,1/cd,6,8,0/cde,6,9,0/cdef,6,10,0/def,7,10,0/ef,8,10,0/f,9,10,0/a,10,11,1/aa,10,11,0");

  a = new NGramSynonymTokenizerTestAnalyzer(4, true, "a,aa/b,bb");
  stream = a.tokenStream("f", new StringReader("gfedcabcdefga"));
  stream.reset();
  assertTokenStream(stream, "gfed,0,4,1/fedc,1,5,1/edc,2,5,0/dc,3,5,0/c,4,5,0/a,5,6,1/aa,5,6,0/b,6,7,1/bb,6,7,0/c,7,8,1/cd,7,9,0/cde,7,10,0/cdef,7,11,0/defg,8,12,1/efg,9,12,0/fg,10,12,0/g,11,12,0/a,12,13,1/aa,12,13,0");
}
 
Example 14
Source File: Field.java    From lucene-solr with Apache License 2.0
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
  if (fieldType().indexOptions() == IndexOptions.NONE) {
    // Not indexed
    return null;
  }

  if (!fieldType().tokenized()) {
    if (stringValue() != null) {
      if (!(reuse instanceof StringTokenStream)) {
        // lazy init the TokenStream as it is heavy to instantiate
        // (attributes,...) if not needed
        reuse = new StringTokenStream();
      }
      ((StringTokenStream) reuse).setValue(stringValue());
      return reuse;
    } else if (binaryValue() != null) {
      if (!(reuse instanceof BinaryTokenStream)) {
        // lazy init the TokenStream as it is heavy to instantiate
        // (attributes,...) if not needed
        reuse = new BinaryTokenStream();
      }
      ((BinaryTokenStream) reuse).setValue(binaryValue());
      return reuse;
    } else {
      throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
    }
  }

  if (tokenStream != null) {
    return tokenStream;
  } else if (readerValue() != null) {
    return analyzer.tokenStream(name(), readerValue());
  } else if (stringValue() != null) {
    return analyzer.tokenStream(name(), stringValue());
  }

  throw new IllegalArgumentException("Field must have either TokenStream, String, Reader or Number value; got " + this);
}
 
Example 15
Source File: IKAnalyzerUtils.java    From emotional_analysis with Apache License 2.0
public static List<String> printAnalysisResult(Analyzer analyzer, String keyWord)  
        throws Exception {  
    TokenStream tokenStream = analyzer.tokenStream("content",  
            new StringReader(keyWord));
    tokenStream.reset();
    tokenStream.addAttribute(CharTermAttribute.class);
    ArrayList<String> keys = new ArrayList<>();
    while (tokenStream.incrementToken()) {  
        CharTermAttribute charTermAttribute = tokenStream  
                .getAttribute(CharTermAttribute.class);  
        keys.add(charTermAttribute.toString());
    }  
    return keys;
}
 
Example 16
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testCustomRules() throws Exception {
    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
    String DIN5007_2_tailorings =
            "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308";

    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
    String tailoredRules = tailoredCollator.getRules();

    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.rules", tailoredRules)
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");

    String germanUmlaut = "Töne";
    TokenStream tsUmlaut = analyzer.tokenStream(null, germanUmlaut);
    BytesRef b1 = bytesFromTokenStream(tsUmlaut);

    String germanExpandedUmlaut = "Toene";
    TokenStream tsExpanded = analyzer.tokenStream(null, germanExpandedUmlaut);
    BytesRef b2 = bytesFromTokenStream(tsExpanded);

    assertTrue(compare(b1.bytes, b2.bytes) == 0);
}
 
Example 17
Source File: TestKoreanNumberFilter.java    From lucene-solr with Apache License 2.0
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
  TokenStream stream = analyzer.tokenStream("dummy", reader);
  stream.reset();

  CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

  while (stream.incrementToken()) {
    writer.write(termAttr.toString());
    writer.write("\n");
  }

  reader.close();
  writer.close();
}
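A hedged usage sketch for the analyze helper above (the analyzer and the reader/writer pair are placeholders, not part of the original test):

Analyzer analyzer = new StandardAnalyzer();
StringWriter out = new StringWriter();
analyze(analyzer, new StringReader("one two three"), out);
// out now contains one analyzed term per line; analyze() closes both streams itself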
 
Example 18
Source File: TestToken.java    From jstarcraft-nlp with Apache License 2.0
public static void main(String[] args) {

//        SynonymsLibrary.put(SynonymsLibrary.DEFAULT, "../../library/synonyms.dic");
//
//        DicLibrary.insert(DicLibrary.DEFAULT, "清华", "n", 2000);
//        DicLibrary.insert(DicLibrary.DEFAULT, "大学", "n", 2000);

        Map<String, String> map = new HashMap<String, String>();

        map.put("type", "base_ansj");
//        map.put(SynonymsLibrary.DEFAULT, SynonymsLibrary.DEFAULT);

        Analyzer ca = new AnsjAnalyzer(map);

        String content = "我爱北京天安门天安门上太阳升我美丽的清华大学";

        try {
            TokenStream tokenStream = ca.tokenStream(content, new StringReader(content));
            // reset() is mandatory before the first incrementToken() call
            tokenStream.reset();

            while (tokenStream.incrementToken()) {

                System.out.print(tokenStream.getAttribute(CharTermAttribute.class));
                System.out.print("\t");
                System.out.print(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
                System.out.print("\t");
                System.out.print(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
                System.out.print("\t");
                System.out.println(tokenStream.getAttribute(TypeAttribute.class).type());

            }
            tokenStream.end();
            tokenStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        ca.close();
    }
 
Example 19
Source File: DumpTermsApp.java    From lucene4ir with Apache License 2.0
public void extractBigramsFromStoredText() throws IOException {

        HashMap<String, Integer> hmap = new HashMap<String, Integer>();
        int n = reader.maxDoc();

        for (int i = 0; i < n; i++) {

            Document doc = reader.document(i);
            String all = doc.get(lucene4ir.Lucene4IRConstants.FIELD_ALL);
            
            Analyzer a = new StandardAnalyzer();
            TokenStream ts = a.tokenStream(null, all);
            ts.reset();
            String w1 = "";
            String w2 = "";
            while (ts.incrementToken()) {
                w1 = w2;
                w2 = ts.getAttribute(CharTermAttribute.class).toString();
                // compare string contents, not references
                if (!w1.isEmpty()) {
                    //System.out.println(w1 + " " + w2);

                    String key = w1 + " " + w2;
                    if (hmap.containsKey(key)) {
                        int v = hmap.get(key);
                        hmap.put(key, v + 1);
                    } else {
                        hmap.put(key, 1);
                    }
                }
            }
            // release the stream and analyzer so resources are not leaked per document
            ts.end();
            ts.close();
            a.close();
        }

        for (Map.Entry<String, Integer> me : hmap.entrySet()) {
            if (me.getValue() > 2) {
                System.out.print(me.getKey() + ": ");
                System.out.println(me.getValue());
            }
        }

    }
 
Example 20
Source File: TokenSources.java    From lucene-solr with Apache License 2.0
/**
 * Get a token stream from either un-inverting a term vector if possible, or by analyzing the text.
 *
 * WARNING: Don't call this if there is more than one value for this field.  If there are, and if there are term
 * vectors, then there is a single tokenstream with offsets suggesting all the field values were concatenated.
 *
 * @param field The field to either get term vectors from or to analyze the text from.
 * @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance, this instance should
 *                 be re-used for the same document (e.g. when highlighting multiple fields).
 * @param text the text to analyze, failing term vector un-inversion
 * @param analyzer the analyzer to analyze {@code text} with, failing term vector un-inversion
 * @param maxStartOffset Terms with a startOffset greater than this aren't returned.  Use -1 for no limit.
 *                       Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1.
 *
 * @return a token stream from either term vectors, or from analyzing the text. Never null.
 */
public static TokenStream getTokenStream(String field, Fields tvFields, String text, Analyzer analyzer,
                                         int maxStartOffset) throws IOException {
  TokenStream tokenStream = getTermVectorTokenStreamOrNull(field, tvFields, maxStartOffset);
  if (tokenStream != null) {
    return tokenStream;
  }
  tokenStream = analyzer.tokenStream(field, text);
  if (maxStartOffset >= 0 && maxStartOffset < text.length() - 1) {
    tokenStream = new LimitTokenOffsetFilter(tokenStream, maxStartOffset);
  }
  return tokenStream;
}