org.apache.lucene.util.automaton.RegExp Java Examples

The following examples show how to use org.apache.lucene.util.automaton.RegExp. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage in the sidebar.
Example #1
Source File: QueryParserTestBase.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that stop words ("the", "foo") are removed by the analyzer during
 * query construction, and that a boost wrapper survives stop-word removal.
 */
public void testStopwords() throws Exception {
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
  CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));

  // Both terms are stop words: the result is empty (empty BooleanQuery or MatchNoDocsQuery).
  Query result = getQuery("field:the OR field:foo",qp);
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery || result instanceof MatchNoDocsQuery);
  if (result instanceof BooleanQuery) {
    assertEquals(0, ((BooleanQuery) result).clauses().size());
  }

  // One real term plus one stop word collapses to a single TermQuery.
  result = getQuery("field:woo OR field:the",qp);
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a TermQuery", result instanceof TermQuery);

  // A boosted query containing stop words keeps its boost; the unwrapped
  // BooleanQuery retains exactly the two surviving clauses.
  result = getQuery("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)",qp);
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a BoostQuery", result instanceof BoostQuery);
  result = ((BoostQuery) result).getQuery();
  assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
  if (VERBOSE) System.out.println("Result: " + result);
  // assertEquals reports expected vs. actual on failure, unlike the previous
  // hand-built assertTrue message.
  assertEquals(2, ((BooleanQuery) result).clauses().size());
}
 
Example #2
Source File: MtasToken.java    From mtas with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a map from each value to an automaton accepting
 * {@code prefix + DELIMITER + value} followed by any number of NUL
 * characters.
 *
 * @param prefix the prefix
 * @param valueList the value list; {@code null} yields an empty map
 * @param filter when {@code true}, escape regexp metacharacters in each value
 *          so it is matched literally; {@code null} is treated as
 *          {@code false} (previously a null caused an NPE on unboxing)
 * @return the map from (possibly escaped) value to its automaton
 */
public static Map<String, Automaton> createAutomatonMap(String prefix,
    List<String> valueList, Boolean filter) {
  HashMap<String, Automaton> automatonMap = new HashMap<>();
  if (valueList != null) {
    // Boolean.TRUE.equals avoids a NullPointerException when filter is null.
    boolean escape = Boolean.TRUE.equals(filter);
    for (String item : valueList) {
      if (escape) {
        // Escape regexp metacharacters: " ) ( < > . @ # ] [ { }
        item = item.replaceAll("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])",
            "\\\\$1");
      }
      automatonMap.put(item,
          new RegExp(prefix + MtasToken.DELIMITER + item + "\u0000*")
              .toAutomaton());
    }
  }
  return automatonMap;
}
 
Example #3
Source File: MockTokenFilterFactory.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** Creates a new MockTokenizerFactory */
public MockTokenFilterFactory(Map<String, String> args) {
  super(args);
  final String stopSetName = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
  final String stopRegex = get(args, "stopregex");
  // The two configuration styles are mutually exclusive.
  if (stopSetName != null && stopRegex != null) {
    throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
  }
  if (stopSetName != null) {
    // Allowed values were restricted to "english" and "empty" by get() above.
    filter = "english".equalsIgnoreCase(stopSetName)
        ? MockTokenFilter.ENGLISH_STOPSET
        : MockTokenFilter.EMPTY_STOPSET;
  } else if (stopRegex != null) {
    filter = new CharacterRunAutomaton(new RegExp(stopRegex).toAutomaton());
  } else {
    throw new IllegalArgumentException
        ("Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
  }
  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
 
Example #4
Source File: TestMockAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** Test a configuration where word starts with one uppercase */
public void testUppercase() throws Exception {
  final CharacterRunAutomaton tokenPattern =
      new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
  final Analyzer analyzer = new MockAnalyzer(random(), tokenPattern, false);

  // Lowercase runs attach to the preceding uppercase letter; consecutive
  // uppercase letters each become their own token.
  assertAnalyzesTo(analyzer, "FooBarBAZ",
      new String[] {"Foo", "Bar", "B", "A", "Z"},
      new int[] {0, 3, 6, 7, 8},
      new int[] {3, 6, 7, 8, 9});

  // A leading lowercase character is not part of any token.
  assertAnalyzesTo(analyzer, "aFooBar",
      new String[] {"Foo", "Bar"},
      new int[] {1, 4},
      new int[] {4, 7});

  checkRandomData(random(), analyzer, 100);
}
 
Example #5
Source File: HighlighterTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testMaxSizeEndHighlight() throws Exception {
  TestHighlightRunner helper = new TestHighlightRunner() {
    @Override
    public void run() throws Exception {
      final TermQuery query = new TermQuery(new Term("text", "searchterm"));
      // "in" and "it" are stop words for the mock analyzer below.
      final CharacterRunAutomaton stopWordSet =
          new CharacterRunAutomaton(new RegExp("i[nt]").toAutomaton());

      final String text = "this is a text with searchterm in it";
      final Highlighter highlighter = getHighlighter(query, "text", new SimpleHTMLFormatter());
      highlighter.setTextFragmenter(new NullFragmenter());
      // Cap analysis at 36 chars — exactly the length of the text above.
      highlighter.setMaxDocCharsToAnalyze(36);
      final String match = highlighter.getBestFragment(
          new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWordSet), "text", text);
      assertTrue(
          "Matched text should contain remainder of text after highlighted query ",
          match.endsWith("in it"));
    }
  };
  helper.start();
}
 
Example #6
Source File: TestIntervals.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
// Builds a multi-term intervals source from the regexp "p.*e", checks the
// matched intervals per document, then verifies that a term-expansion limit
// of 1 makes interval iteration fail with an IllegalStateException.
public void testMultiTerm() throws IOException {
  RegExp re = new RegExp("p.*e");
  IntervalsSource source = Intervals.multiterm(new CompiledAutomaton(re.toAutomaton()), re.toString());

  // Expected intervals over "field1"; each row is one document's flat list of
  // (start, end) position pairs. NOTE(review): the 5 is presumably the
  // expected document/hit count — confirm against checkIntervals.
  checkIntervals(source, "field1", 5, new int[][]{
      {},
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0 }
  });

  // With a maximum expansion of 1 term, iterating intervals on any leaf must
  // throw, and the message names the automaton and the limit.
  IllegalStateException e = expectThrows(IllegalStateException.class, () -> {
    IntervalsSource s = Intervals.multiterm(new CompiledAutomaton(re.toAutomaton()), 1, re.toString());
    for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
      s.intervals("field1", ctx);
    }
  });
  assertEquals("Automaton [\\p(.)*\\e] expanded to too many terms (limit 1)", e.getMessage());

  checkVisits(source, 1);
}
 
Example #7
Source File: TestMockAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** Test a configuration where three characters makes a term */
public void testThreeChars() throws Exception {
  final CharacterRunAutomaton threeCharPattern =
      new CharacterRunAutomaton(new RegExp("...").toAutomaton());
  final Analyzer analyzer = new MockAnalyzer(random(), threeCharPattern, false);

  // "foobar" splits into two complete three-char tokens.
  assertAnalyzesTo(analyzer, "foobar",
      new String[] {"foo", "bar"},
      new int[] {0, 3},
      new int[] {3, 6});

  // make sure when last term is a "partial" match that end() is correct
  assertTokenStreamContents(analyzer.tokenStream("bogus", "fooba"),
      new String[] {"foo"},
      new int[] {0},
      new int[] {3},
      new int[] {1},
      5);

  checkRandomData(random(), analyzer, 100);
}
 
Example #8
Source File: TestMockAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** Test a configuration where two characters makes a term */
public void testTwoChars() throws Exception {
  final CharacterRunAutomaton twoCharPattern =
      new CharacterRunAutomaton(new RegExp("..").toAutomaton());
  final Analyzer analyzer = new MockAnalyzer(random(), twoCharPattern, false);

  // "foobar" splits into three complete two-char tokens.
  assertAnalyzesTo(analyzer, "foobar",
      new String[] {"fo", "ob", "ar"},
      new int[] {0, 2, 4},
      new int[] {2, 4, 6});

  // make sure when last term is a "partial" match that end() is correct
  assertTokenStreamContents(analyzer.tokenStream("bogus", "fooba"),
      new String[] {"fo", "ob"},
      new int[] {0, 2},
      new int[] {2, 4},
      new int[] {1, 1},
      5);

  checkRandomData(random(), analyzer, 100);
}
 
Example #9
Source File: TestRegexpQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Tests supplying a custom {@link AutomatonProvider} that resolves the named
 * automaton {@code <quickBrown>} inside a regular expression.
 */
public void testCustomProvider() throws IOException {
  // Automaton that matches "quick", "brown", or "bob" (the original inline
  // comment incorrectly omitted "bob").
  final Automaton quickBrownAutomaton = Operations.union(Arrays
      .asList(Automata.makeString("quick"),
      Automata.makeString("brown"),
      Automata.makeString("bob")));
  // AutomatonProvider has a single abstract method, so a lambda replaces the
  // anonymous class.
  AutomatonProvider myProvider = name ->
      name.equals("quickBrown") ? quickBrownAutomaton : null;
  RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL,
    myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(1, searcher.search(query, 5).totalHits.value);
}
 
Example #10
Source File: TestFieldCacheRewriteMethod.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Test fieldcache rewrite against filter rewrite */
@Override
protected void assertSame(String regexp) throws IOException {
  // Same pattern, two different rewrite strategies.
  RegexpQuery docValuesQuery = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
  docValuesQuery.setRewriteMethod(new DocValuesRewriteMethod());

  RegexpQuery constantScoreQuery = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
  constantScoreQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);

  TopDocs docValuesHits = searcher1.search(docValuesQuery, 25);
  TopDocs constantScoreHits = searcher2.search(constantScoreQuery, 25);

  // Both rewrites must surface identical documents.
  CheckHits.checkEqual(docValuesQuery, docValuesHits.scoreDocs, constantScoreHits.scoreDocs);
}
 
Example #11
Source File: QueryParserTestBase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testPhraseQueryPositionIncrements() throws Exception {
  // Case-insensitive "stop" is treated as a stop word by the mock analyzer.
  CharacterRunAutomaton stopWordAutomaton =
      new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
  CommonQueryParserConfiguration qp = getParserConfig(
      new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopWordAutomaton));
  qp.setEnablePositionIncrements(true);

  // With position increments enabled, the removed stop word leaves a gap:
  // "2" sits two positions after "1".
  PhraseQuery.Builder expected = new PhraseQuery.Builder();
  expected.add(new Term("field", "1"));
  expected.add(new Term("field", "2"), 2);
  assertEquals(expected.build(), getQuery("\"1 stop 2\"",qp));
}
 
Example #12
Source File: TestDocValuesRewriteMethod.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** check that the # of hits is the same as if the query
 * is run against the inverted index
 */
protected void assertSame(String regexp) throws IOException {
  RegexpQuery viaDocValues = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
  viaDocValues.setRewriteMethod(new DocValuesRewriteMethod());
  RegexpQuery viaInvertedIndex = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);

  // The inverted-index query runs on searcher1, the doc-values one on searcher2.
  TopDocs invertedHits = searcher1.search(viaInvertedIndex, 25);
  TopDocs docValuesHits = searcher2.search(viaDocValues, 25);

  CheckHits.checkEqual(viaInvertedIndex, invertedHits.scoreDocs, docValuesHits.scoreDocs);
}
 
Example #13
Source File: TestDocValuesRewriteMethod.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testEquals() throws Exception {
  RegexpQuery first = new RegexpQuery(new Term(fieldName, "[aA]"), RegExp.NONE);
  RegexpQuery second = new RegexpQuery(new Term(fieldName, "[aA]"), RegExp.NONE);
  RegexpQuery different = new RegexpQuery(new Term(fieldName, "[bB]"), RegExp.NONE);

  // Identical patterns compare equal; different patterns do not.
  assertEquals(first, second);
  assertFalse(first.equals(different));

  // The equals contract must survive switching the rewrite method.
  first.setRewriteMethod(new DocValuesRewriteMethod());
  second.setRewriteMethod(new DocValuesRewriteMethod());
  different.setRewriteMethod(new DocValuesRewriteMethod());
  assertEquals(first, second);
  assertFalse(first.equals(different));
  QueryUtils.check(first);
}
 
Example #14
Source File: TestSpanFirstQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
// Verifies SpanFirstQuery against token start positions shifted by stop
// words: "quick" starts at position 0 in "quick brown fox" but at position 1
// in "the quick brown fox" once "the" is removed.
public void testStartPositions() throws Exception {
  Directory dir = newDirectory();
  
  // mimic StopAnalyzer: "the", "a" and "of" are stop words
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
  
  // Two docs: one with a leading stop word, one without.
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
  Document doc = new Document();
  doc.add(newTextField("field", "the quick brown fox", Field.Store.NO));
  writer.addDocument(doc);
  Document doc2 = new Document();
  doc2.add(newTextField("field", "quick brown fox", Field.Store.NO));
  writer.addDocument(doc2);
  
  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  
  // user queries on "starts-with quick": only the doc without the stop word
  // has "quick" before position 1
  SpanQuery sfq = spanFirstQuery(spanTermQuery("field", "quick"), 1);
  assertEquals(1, searcher.search(sfq, 10).totalHits.value);
  
  // user queries on "starts-with the quick": widening the window to 2 and
  // subtracting the previous match leaves exactly the stop-word-prefixed doc
  SpanQuery include = spanFirstQuery(spanTermQuery("field", "quick"), 2);
  sfq = spanNotQuery(include, sfq);
  assertEquals(1, searcher.search(sfq, 10).totalHits.value);
  
  writer.close();
  reader.close();
  dir.close();
}
 
Example #15
Source File: TestRegexpRandom2.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** check that the # of hits is the same as from a very
 * simple regexpquery implementation.
 */
protected void assertSame(String regexp) throws IOException {
  // Compare the real automaton-based query against a brute-force reference.
  RegexpQuery automatonQuery = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
  DumbRegexpQuery referenceQuery = new DumbRegexpQuery(new Term(fieldName, regexp), RegExp.NONE);

  TopDocs automatonHits = searcher1.search(automatonQuery, 25);
  TopDocs referenceHits = searcher2.search(referenceQuery, 25);

  CheckHits.checkEqual(automatonQuery, automatonHits.scoreDocs, referenceHits.scoreDocs);
}
 
Example #16
Source File: MapperQueryParser.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a regexp query for a single field, preferring the mapped field
 * type's own regexpQuery implementation and falling back to the default
 * query-parser behavior otherwise.
 *
 * @param field the field name, resolved through the mapping
 * @param termStr the raw regexp pattern from the query string
 * @return the regexp query, or null when lenient mode swallowed a failure
 * @throws ParseException from the superclass fallback path
 */
private Query getRegexpQuerySingle(String field, String termStr) throws ParseException {
    currentFieldType = null;
    // Remember the active analyzer so the finally block can restore it.
    Analyzer oldAnalyzer = getAnalyzer();
    try {
        currentFieldType = parseContext.fieldMapper(field);
        if (currentFieldType != null) {
            if (!forcedAnalyzer) {
                // Use the field's own search analyzer unless one was forced.
                setAnalyzer(parseContext.getSearchAnalyzer(currentFieldType));
            }
            Query query = null;
            if (currentFieldType.useTermQueryWithQueryString()) {
                query = currentFieldType.regexpQuery(termStr, RegExp.ALL, maxDeterminizedStates, multiTermRewriteMethod, parseContext);
            }
            if (query == null) {
                // Field type declined to build the query: fall back to default.
                query = super.getRegexpQuery(field, termStr);
            }
            return query;
        }
        return super.getRegexpQuery(field, termStr);
    } catch (RuntimeException e) {
        // In lenient mode, a malformed pattern is ignored instead of failing
        // the whole query string.
        if (settings.lenient()) {
            return null;
        }
        throw e;
    } finally {
        // Always restore the analyzer that was active on entry.
        setAnalyzer(oldAnalyzer);
    }
}
 
Example #17
Source File: TestFieldCacheRewriteMethod.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testEquals() throws Exception {
  RegexpQuery lowerUpperA = new RegexpQuery(new Term(fieldName, "[aA]"), RegExp.NONE);
  RegexpQuery lowerUpperACopy = new RegexpQuery(new Term(fieldName, "[aA]"), RegExp.NONE);
  RegexpQuery lowerUpperB = new RegexpQuery(new Term(fieldName, "[bB]"), RegExp.NONE);

  // Same pattern compares equal; a different pattern does not.
  assertEquals(lowerUpperA, lowerUpperACopy);
  assertFalse(lowerUpperA.equals(lowerUpperB));

  // Switching every query to the doc-values rewrite must preserve the contract.
  lowerUpperA.setRewriteMethod(new DocValuesRewriteMethod());
  lowerUpperACopy.setRewriteMethod(new DocValuesRewriteMethod());
  lowerUpperB.setRewriteMethod(new DocValuesRewriteMethod());
  assertEquals(lowerUpperA, lowerUpperACopy);
  assertFalse(lowerUpperA.equals(lowerUpperB));
  QueryUtils.check(lowerUpperA);
}
 
Example #18
Source File: TestTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testIntersectRegexp() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(newStringField("field", "foobar", Field.Store.NO));
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  Terms terms = MultiTerms.getTerms(reader, "field");
  // intersect() rejects this compiled automaton and directs callers to
  // CompiledAutomaton.getTermsEnum instead.
  CompiledAutomaton compiled = new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());
  String message =
      expectThrows(IllegalArgumentException.class, () -> terms.intersect(compiled, null)).getMessage();
  assertEquals("please use CompiledAutomaton.getTermsEnum instead", message);
  reader.close();
  writer.close();
  dir.close();
}
 
Example #19
Source File: TestQueryBuilder.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testPhraseQueryPositionIncrements() throws Exception {
  // Expected phrase: "2" two positions after "1", reflecting the removed
  // stop word in between.
  PhraseQuery.Builder expectedBuilder = new PhraseQuery.Builder();
  expectedBuilder.add(new Term("field", "1"), 0);
  expectedBuilder.add(new Term("field", "2"), 2);
  PhraseQuery expected = expectedBuilder.build();

  // Case-insensitive "stop" is a stop word for the mock analyzer.
  CharacterRunAutomaton stopWords =
      new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopWords);

  assertEquals(expected, new QueryBuilder(analyzer).createPhraseQuery("field", "1 stop 2"));
}
 
Example #20
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that two Terms instances expose equivalent content: statistics,
 * full enumeration, seeking, and (when {@code deep}) several random regexp
 * intersections.
 *
 * @param leftTerms one side; may be null only when rightTerms is also null
 * @param rightTerms the other side
 * @param deep whether to additionally compare random automaton intersections
 */
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
  if (leftTerms == null || rightTerms == null) {
    // Either both sides are null, or the assertion fails.
    assertNull(leftTerms);
    assertNull(rightTerms);
    return;
  }
  assertTermsStatistics(leftTerms, rightTerms);
  
  // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different
  
  // Only compare freqs/positions when both sides index them.
  boolean bothHaveFreqs = leftTerms.hasFreqs() && rightTerms.hasFreqs();
  boolean bothHavePositions = leftTerms.hasPositions() && rightTerms.hasPositions();
  TermsEnum leftTermsEnum = leftTerms.iterator();
  TermsEnum rightTermsEnum = rightTerms.iterator();
  assertTermsEnum(leftTermsEnum, rightTermsEnum, true, bothHaveFreqs, bothHavePositions);
  
  assertTermsSeeking(leftTerms, rightTerms);
  
  if (deep) {
    // Intersect both term dictionaries with random regexps and compare.
    int numIntersections = atLeast(3);
    for (int i = 0; i < numIntersections; i++) {
      String re = AutomatonTestUtil.randomRegexp(random());
      CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
      if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
        // TODO: test start term too
        TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
        TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
        assertTermsEnum(leftIntersection, rightIntersection, rarely(), bothHaveFreqs, bothHavePositions);
      }
    }
  }
}
 
Example #21
Source File: RegexCriteriaValidator.java    From nexus-public with Eclipse Public License 1.0 5 votes vote down vote up
/**
 * Ensures that a regular expression entered is a valid pattern.
 *
 * @param expression the candidate pattern; returned unchanged when valid
 * @throws InvalidExpressionException when the expression is deemed invalid
 */
public static String validate(final String expression) {
  try {
    // RegExp's constructor performs the full parse; only its throwing
    // side effect matters here.
    new RegExp(expression);
    return expression;
  }
  catch (IllegalArgumentException e) {  // NOSONAR
    throw new InvalidExpressionException(
        format("Invalid regular expression pattern: %s", e.getMessage()));
  }
}
 
Example #22
Source File: TestMockAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Test a configuration that behaves a lot like LengthFilter */
public void testLength() throws Exception {
  // Tokens of five or more characters are filtered out ("toolong", "notfine").
  CharacterRunAutomaton fiveOrMoreChars =
      new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton());
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, fiveOrMoreChars);
  assertAnalyzesTo(analyzer, "ok toolong fine notfine",
      new String[] {"ok", "fine"},
      new int[] {1, 2});
}
 
Example #23
Source File: IncludeExclude.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * @param include   The regular expression pattern for the terms to be included
 * @param exclude   The regular expression pattern for the terms to be excluded
 * @throws IllegalArgumentException if both patterns are null
 */
public IncludeExclude(RegExp include, RegExp exclude) {
    if (include == null && exclude == null) {
        // A message makes the misconfiguration diagnosable from the stack
        // trace (the original threw a bare IllegalArgumentException).
        throw new IllegalArgumentException("At least one of include or exclude must be non-null");
    }
    this.include = include;
    this.exclude = exclude;
    // Regexp-based filtering and explicit value sets are mutually exclusive.
    this.includeValues = null;
    this.excludeValues = null;
}
 
Example #24
Source File: IncludeExclude.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
public IncludeExclude includeExclude() {
    // Compile the raw pattern strings; null means "not configured".
    RegExp includePattern = include == null ? null : new RegExp(include);
    RegExp excludePattern = exclude == null ? null : new RegExp(exclude);
    boolean hasPatterns = includePattern != null || excludePattern != null;
    boolean hasValues = includeValues != null || excludeValues != null;
    if (hasPatterns) {
        if (hasValues) {
            throw new IllegalArgumentException("Can only use regular expression include/exclude or a set of values, not both");
        }
        return new IncludeExclude(includePattern, excludePattern);
    }
    if (hasValues) {
        return new IncludeExclude(includeValues, excludeValues);
    }
    // Nothing configured at all.
    return null;
}
 
Example #25
Source File: RegexCompletionQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  // An empty regex must match nothing, mirroring PrefixCompletionQuery's
  // behavior of returning no results for an empty term.
  final String pattern = getTerm().text();
  final Automaton automaton;
  if (pattern.isEmpty()) {
    automaton = Automata.makeEmpty();
  } else {
    automaton = new RegExp(pattern, flags).toAutomaton(maxDeterminizedStates);
  }
  return new CompletionWeight(this, automaton);
}
 
Example #26
Source File: SimplePatternSplitTokenizerFactory.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Creates a new SimpleSplitPatternTokenizerFactory */
public SimplePatternSplitTokenizerFactory(Map<String,String> args) {
  super(args);
  maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
  if (args.isEmpty() == false) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
 
Example #27
Source File: SimplePatternTokenizerFactory.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Creates a new SimplePatternTokenizerFactory */
public SimplePatternTokenizerFactory(Map<String,String> args) {
  super(args);
  maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
  if (args.isEmpty() == false) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
 
Example #28
Source File: MinHashFilterTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
private static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
  // One token = shingleSize whitespace-separated words: a word followed by
  // (shingleSize - 1) repetitions of "separator + word".
  final String word = "[^ \t\r\n]+";
  final String separator = "[ \t\r\n]+";
  final String pattern = word + "(" + separator + word + "){" + (shingleSize - 1) + "}";
  MockTokenizer tokenizer = new MockTokenizer(
      new CharacterRunAutomaton(new RegExp(pattern).toAutomaton()), true);
  tokenizer.setEnableChecks(true);
  if (shingles != null) {
    tokenizer.setReader(new StringReader(shingles));
  }
  return tokenizer;
}
 
Example #29
Source File: LuceneTestCase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** 
 * Terms api equivalency: asserts both Terms instances expose equivalent
 * statistics, index options, full enumeration, seeking behavior, and (when
 * {@code deep}) several random regexp intersections.
 *
 * @param info prefix included in assertion failure messages
 * @param leftReader reader the left terms came from (for enum comparison)
 * @param leftTerms one side; may be null only when rightTerms is also null
 * @param rightTerms the other side
 * @param deep whether to additionally compare random automaton intersections
 */
public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException {
  if (leftTerms == null || rightTerms == null) {
    // Either both sides are null, or the assertion fails.
    assertNull(info, leftTerms);
    assertNull(info, rightTerms);
    return;
  }
  assertTermsStatisticsEquals(info, leftTerms, rightTerms);
  // Unlike assertTerms in TestBlockPostingsFormat3, index options must match.
  assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets());
  assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions());
  assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads());

  TermsEnum leftTermsEnum = leftTerms.iterator();
  TermsEnum rightTermsEnum = rightTerms.iterator();
  assertTermsEnumEquals(info, leftReader, leftTermsEnum, rightTermsEnum, true);
  
  assertTermsSeekingEquals(info, leftTerms, rightTerms);
  
  if (deep) {
    // Intersect both term dictionaries with random regexps and compare.
    int numIntersections = atLeast(3);
    for (int i = 0; i < numIntersections; i++) {
      String re = AutomatonTestUtil.randomRegexp(random());
      CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
      if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
        // TODO: test start term too
        TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
        TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
        assertTermsEnumEquals(info, leftReader, leftIntersection, rightIntersection, rarely());
      }
    }
  }
}
 
Example #30
Source File: TestMockAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Test a configuration where each character is a term */
public void testSingleChar() throws Exception {
  final CharacterRunAutomaton anyChar =
      new CharacterRunAutomaton(new RegExp(".").toAutomaton());
  final Analyzer analyzer = new MockAnalyzer(random(), anyChar, false);
  // Every character becomes its own token with width-1 offsets.
  assertAnalyzesTo(analyzer, "foobar",
      new String[] {"f", "o", "o", "b", "a", "r"},
      new int[] {0, 1, 2, 3, 4, 5},
      new int[] {1, 2, 3, 4, 5, 6});
  checkRandomData(random(), analyzer, 100);
}