Java Code Examples for org.apache.lucene.analysis.MockTokenizer#setEnableChecks()

The following examples show how to use org.apache.lucene.analysis.MockTokenizer#setEnableChecks(). MockTokenizer is part of Lucene's test framework: with checks enabled, it verifies that its consumer follows the TokenStream workflow (reset(), incrementToken() until it returns false, end(), close()) and consumes every token. The examples below, all taken from the lucene-solr project, enable the checks when a test consumes the stream fully and disable them when the workflow is intentionally violated, for example by consuming only part of the stream, resetting in an unusual place, or forcefully calling close() on an exception path.
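Before the project examples, here is a minimal self-contained sketch of the basic pattern. This is a hypothetical snippet, not taken from lucene-solr; it assumes the lucene-test-framework dependency is on the classpath, and the class name is illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SetEnableChecksDemo {
  public static void main(String[] args) throws Exception {
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader("hello world"));
    // With checks enabled, MockTokenizer verifies that the consumer follows
    // the full workflow below; skipping end()/close() or leaving tokens
    // unconsumed would make the checks fail.
    tokenizer.setEnableChecks(true);

    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(termAtt.toString()); // prints "hello", then "world"
    }
    tokenizer.end();
    tokenizer.close();
  }
}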
Example 1
Source File: TestIDVersionPostingsFormat.java    From lucene-solr with Apache License 2.0
public void testMissingPayload() throws Exception {
  Directory dir = newDirectory();

  // MockAnalyzer minus maybePayload else it sometimes stuffs in an 8-byte payload!
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
        tokenizer.setEnableChecks(true);
        MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
        return new TokenStreamComponents(tokenizer, filt);
      }
    };
  IndexWriterConfig iwc = newIndexWriterConfig(a);
  iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc, false);
  Document doc = new Document();
  doc.add(newTextField("id", "id", Field.Store.NO));
  expectThrows(IllegalArgumentException.class, () -> {
    w.addDocument(doc);
    w.commit(false);
  });

  w.close();
  dir.close();
}
 
Example 2
Source File: TestCompoundWordTokenFilter.java    From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
  CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
      "Aufgabe", "Überwachung");

  MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wsTokenizer.setEnableChecks(false); // we will reset in a strange place
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      wsTokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  
  CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  assertTrue(tf.incrementToken());
  assertEquals("Rind", termAtt.toString());
  tf.end();
  tf.close();
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
 
Example 3
Source File: TestLimitTokenPositionFilter.java    From lucene-solr with Apache License 2.0
public void testMaxPosition3WithSynomyms() throws IOException {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);

    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRefBuilder multiWordCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef.get(), true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef.get(), true);
    SynonymMap synonymMap = builder.build();
    @SuppressWarnings("deprecation")
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
    assertTokenStreamContents(stream,
        new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
        new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
  }
}
 
Example 4
Source File: TestLimitTokenPositionFilterFactory.java    From lucene-solr with Apache License 2.0
public void testMaxPosition1() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
    MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("LimitTokenPosition",
        LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1",
        LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{"A1"});
  }
}
 
Example 5
Source File: TestFingerprintFilter.java    From lucene-solr with Apache License 2.0
public void testSingleToken() throws Exception {
  for (final boolean consumeAll : new boolean[] { true, false }) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("A1");
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new FingerprintFilter(tokenizer);
    assertTokenStreamContents(stream, new String[] { "A1" });
  }
}
 
Example 6
Source File: TestFingerprintFilter.java    From lucene-solr with Apache License 2.0
public void testCustomSeparator() throws Exception {
  for (final boolean consumeAll : new boolean[] { true, false }) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("B2 A1 C3 B2");
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new FingerprintFilter(tokenizer,
        FingerprintFilter.DEFAULT_MAX_OUTPUT_TOKEN_SIZE, '_');
    assertTokenStreamContents(stream, new String[] { "A1_B2_C3" });
  }
}
 
Example 7
Source File: TestFingerprintFilter.java    From lucene-solr with Apache License 2.0
public void testMaxFingerprintSize() throws Exception {
  for (final boolean consumeAll : new boolean[] { true, false }) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("B2 A1 C3 D4 E5 F6 G7 H1");
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new FingerprintFilter(tokenizer, 4, ' ');
    assertTokenStreamContents(stream, new String[] {});
  }
}
 
Example 8
Source File: TestFingerprintFilter.java    From lucene-solr with Apache License 2.0
public void testAllDupValues() throws Exception {
  for (final boolean consumeAll : new boolean[] { true, false }) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("B2 B2");
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new FingerprintFilter(tokenizer);
    assertTokenStreamContents(stream, new String[] { "B2" });
  }
}
 
Example 9
Source File: TestLimitTokenOffsetFilterFactory.java    From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("LimitTokenOffset",
        LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
        LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{"A1", "B2"});
  }
}
 
Example 10
Source File: TestIndexWriterExceptions.java    From lucene-solr with Apache License 2.0
public void testDocumentsWriterExceptionFailOneDoc() throws Exception {
  Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      tokenizer.setEnableChecks(false); // disable workflow checking as we forcefully close() in exceptional cases.
      return new TokenStreamComponents(tokenizer, new CrashingFilter(fieldName, tokenizer));
    }
  };
  for (int i = 0; i < 10; i++) {
    try (Directory dir = newDirectory();
         final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer)
             .setMaxBufferedDocs(-1)
             .setRAMBufferSizeMB(random().nextBoolean() ? 0.00001 : Integer.MAX_VALUE)
             .setMergePolicy(new FilterMergePolicy(NoMergePolicy.INSTANCE) {
               @Override
               public boolean keepFullyDeletedSegment(IOSupplier<CodecReader> readerIOSupplier) {
                 return true;
               }
             }))) {
      Document doc = new Document();
      doc.add(newField("contents", "here are some contents", DocCopyIterator.custom5));
      writer.addDocument(doc);
      doc.add(newField("crash", "this should crash after 4 terms", DocCopyIterator.custom5));
      doc.add(newField("other", "this will not get indexed", DocCopyIterator.custom5));
      expectThrows(IOException.class, () -> {
        writer.addDocument(doc);
      });
      writer.commit();
      try (IndexReader reader = DirectoryReader.open(dir)) {
        assertEquals(2, reader.docFreq(new Term("contents", "here")));
        assertEquals(2, reader.maxDoc());
        assertEquals(1, reader.numDocs());
      }
    }
  }
}
 
Example 11
Source File: TestIndexWriterExceptions.java    From lucene-solr with Apache License 2.0
public void testExceptionJustBeforeFlush() throws IOException {
  Directory dir = newDirectory();

  final AtomicBoolean doCrash = new AtomicBoolean();

  Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      tokenizer.setEnableChecks(false); // disable workflow checking as we forcefully close() in exceptional cases.
      TokenStream stream = tokenizer;
      if (doCrash.get()) {
        stream = new CrashingFilter(fieldName, stream);
      }
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  IndexWriter w = RandomIndexWriter.mockIndexWriter(random(), dir, 
                                                    newIndexWriterConfig(analyzer)
                                                      .setMaxBufferedDocs(2), 
                                                    new TestPoint1());
  Document doc = new Document();
  doc.add(newTextField("field", "a field", Field.Store.YES));
  w.addDocument(doc);

  Document crashDoc = new Document();
  crashDoc.add(newTextField("crash", "do it on token 4", Field.Store.YES));
  doCrash.set(true);
  expectThrows(IOException.class, () -> {
    w.addDocument(crashDoc);
  });

  w.addDocument(doc);
  w.close();
  dir.close();
}
 
Example 12
Source File: FuzzySuggesterTest.java    From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String fieldName) {
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  tokenizer.setEnableChecks(true);
  TokenStream next;
  if (numStopChars != 0) {
    next = new TokenEater(preserveHoles, tokenizer, numStopChars);
  } else {
    next = tokenizer;
  }
  return new TokenStreamComponents(tokenizer, next);
}
 
Example 13
Source File: TestLimitTokenCountFilterFactory.java    From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("LimitTokenCount",
        LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY, "3",
        LimitTokenCountFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{"A1", "B2", "C3"});
  }
}
 
Example 14
Source File: TestFingerprintFilterFactory.java    From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    Reader reader = new StringReader("A1 B2 A1 D4 C3");
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("Fingerprint",
        FingerprintFilterFactory.MAX_OUTPUT_TOKEN_SIZE_KEY, "256",
        FingerprintFilterFactory.SEPARATOR_KEY, "_"
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{"A1_B2_C3_D4"});
  }
}
 
Example 15
Source File: TestLimitTokenCountFilter.java    From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new LimitTokenCountFilter(tokenizer, 3, consumeAll);
    assertTokenStreamContents(stream, new String[]{"A1", "B2", "C3"});
  }
}
 
Example 16
Source File: TestHunspellStemFilter.java    From lucene-solr with Apache License 2.0
/** simple test for longestOnly option */
public void testLongestOnly() throws IOException {
  MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
  tokenizer.setEnableChecks(true);
  HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, true);
  assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
 
Example 17
Source File: TestConcatenateGraphFilterFactory.java    From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    final String input = "A1 B2 A1 D4 C3";
    Reader reader = new StringReader(input);
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("ConcatenateGraph",
        "tokenSeparator", "\u001F"
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
  }
}
 
Example 18
Source File: MinHashFilterTest.java    From lucene-solr with Apache License 2.0
private static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
  MockTokenizer tokenizer = new MockTokenizer(
      new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){" + (shingleSize - 1) + "}").toAutomaton()),
      true);
  tokenizer.setEnableChecks(true);
  if (shingles != null) {
    tokenizer.setReader(new StringReader(shingles));
  }
  return tokenizer;
}
 
Example 19
Source File: AnalyzingSuggesterTest.java    From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String fieldName) {
  MockTokenizer tokenizer = new MockTokenizer(MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY,
      MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  tokenizer.setEnableChecks(true);
  TokenStream next;
  if (numStopChars != 0) {
    next = new TokenEater(preserveHoles, tokenizer, numStopChars);
  } else {
    next = tokenizer;
  }
  return new TokenStreamComponents(tokenizer, next);
}
 
Example 20
Source File: MockTokenizerFactory.java    From lucene-solr with Apache License 2.0
@Override
public MockTokenizer create(AttributeFactory factory) {
  MockTokenizer t = new MockTokenizer(factory, pattern, false);
  t.setEnableChecks(enableChecks);
  return t;
}