dk.brics.automaton.RegExp Java Examples

The following examples show how to use dk.brics.automaton.RegExp. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also find related API usage in the sidebar.
Example #1
Source File: FlamdexUtils.java    From imhotep with Apache License 2.0 5 votes vote down vote up
/**
 * Compiles {@code regex} into an automaton and fills a bitset, sized by the
 * reader's document count, through the int-field or string-field helper
 * depending on which of the reader's field lists contains {@code field}.
 *
 * @param field  name of the field to run the regex against
 * @param regex  regular expression in dk.brics syntax
 * @param reader reader supplying the field lists and the document count
 * @return the bitset handed to the helper; left exactly as allocated when
 *         {@code field} is in neither field list
 */
public static ThreadSafeBitSet cacheRegex(final String field, final String regex, final FlamdexReader reader) {
    final RegExp parsed = new RegExp(regex);
    final Automaton matcher = parsed.toAutomaton();
    final ThreadSafeBitSet matches = new ThreadSafeBitSet(reader.getNumDocs());

    if (reader.getIntFields().contains(field)) {
        cacheIntFieldRegex(field, reader, matcher, matches);
        return matches;
    }
    if (reader.getStringFields().contains(field)) {
        cacheStringFieldRegex(field, reader, matcher, matches);
        return matches;
    }

    // Unknown field: deliberately not an error. Fields can be added over time and
    // queries can legitimately span boundaries where the field is not defined, so
    // the freshly allocated bitset is returned untouched.
    return matches;
}
 
Example #2
Source File: MultiPattern.java    From multiregexp with MIT License 5 votes vote down vote up
/**
 * Compiles every pattern of this instance, each prepended with {@code prefix},
 * minimizes the resulting automata and combines them via
 * {@link MultiPatternAutomaton#make}.
 *
 * @param prefix regex fragment concatenated in front of each pattern
 * @return the multi-pattern automaton built from the prefixed patterns
 */
public MultiPatternAutomaton makeAutomatonWithPrefix(String prefix) {
    final List<Automaton> minimized = new ArrayList<>();
    for (final String pattern : this.patterns) {
        final RegExp prefixedRegex = new RegExp(prefix + pattern);
        final Automaton prefixed = prefixedRegex.toAutomaton();
        prefixed.minimize();
        minimized.add(prefixed);
    }
    return MultiPatternAutomaton.make(minimized);
}
 
Example #3
Source File: MultiPattern.java    From multiregexp with MIT License 5 votes vote down vote up
/**
 * Counterpart of Pattern.compile whose result can only be used for pattern search.
 * The returned searcher reports the first occurrence of a pattern.
 *
 * Building the searcher is expensive: cache the result when the same patterns
 * are going to be searched for in several different strings.
 *
 * @return A searcher object
 */
public MultiPatternSearcher searcher() {
    // Combined automaton: every pattern prefixed with ".*".
    final MultiPatternAutomaton prefixedAutomaton = makeAutomatonWithPrefix(".*");

    // One automaton per pattern, compiled without the prefix.
    final List<Automaton> perPatternAutomata = new ArrayList<>();
    for (final String source : this.patterns) {
        final RegExp regexp = new RegExp(source);
        final Automaton single = regexp.toAutomaton();
        single.minimize();
        single.determinize();
        perPatternAutomata.add(single);
    }
    return new MultiPatternSearcher(prefixedAutomaton, perPatternAutomata);
}
 
Example #4
Source File: UnicodeTest.java    From multiregexp with MIT License 5 votes vote down vote up
/**
 * Checks that a regexp containing non-ASCII (CJK) literals compiles to an
 * automaton that accepts a well-formed date and rejects a malformed one.
 */
@Test
public void testAutomatonWithUnicode() {
    // Optional 2-4 digit year, then a 1-2 digit month and a 1-2 digit day.
    final String datePattern = "([0-9]{2,4}年)?[0-9]{1,2}月[0-9]{1,2}日";
    final Automaton forward = new RegExp(datePattern).toAutomaton();
    final RunAutomaton matcher = new RunAutomaton(forward);

    Assert.assertTrue(matcher.run("1982年9月17日"));
    // Three-digit day exceeds the {1,2} bound.
    Assert.assertFalse(matcher.run("1982年9月127日"));
}
 
Example #5
Source File: StringPattern.java    From consulo with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a pattern backed by a dk.brics automaton derived from {@code s}.
 * <p>
 * When regexp-escaping leaves {@code s} unchanged, the method simply delegates to
 * {@code equalTo(s)}. Otherwise {@code s} is rewritten character by character:
 * a space becomes the {@code <whitespace>} token, an upper-case character becomes
 * a two-character class holding its upper- and lower-case forms, and anything else
 * is copied verbatim. The rewritten text is compiled with a
 * {@code DatatypesAutomatonProvider} (presumably what resolves the
 * {@code <whitespace>} token; confirm against the brics documentation).
 * <p>
 * Before matching, the resulting condition drops a single leading {@code "} or
 * {@code '} from the candidate string.
 *
 * @param s the source text the automaton is built from
 * @return a pattern accepting the strings the automaton accepts
 */
@Nonnull
public StringPattern matchesBrics(@NonNls @Nonnull final String s) {
  if (StringUtil.escapeToRegexp(s).equals(s)) {
    return equalTo(s);
  }

  final StringBuilder regexText = new StringBuilder(s.length()*5);
  for (final char ch : s.toCharArray()) {
    if (ch == ' ') {
      regexText.append("<whitespace>");
      continue;
    }
    // Upper-case characters are widened to match either case; note this
    // rewrites every upper-case character of the input, wherever it appears.
    if (Character.isUpperCase(ch)) {
      regexText.append('[')
               .append(Character.toUpperCase(ch))
               .append(Character.toLowerCase(ch))
               .append(']');
      continue;
    }
    regexText.append(ch);
  }

  final Automaton automaton = new RegExp(regexText.toString()).toAutomaton(new DatatypesAutomatonProvider());
  final RunAutomaton runAutomaton = new RunAutomaton(automaton, true);

  return with(new ValuePatternCondition<String>("matchesBrics") {
    @Override
    public boolean accepts(@Nonnull String str, final ProcessingContext context) {
      // A single leading quote character is ignored when matching.
      final boolean startsWithQuote = !str.isEmpty() && (str.charAt(0) == '"' || str.charAt(0) == '\'');
      return runAutomaton.run(startsWithQuote ? str.substring(1) : str);
    }

    @Override
    public Collection<String> getValues() {
      return Collections.singleton(s);
    }
  });
}
 
Example #6
Source File: AutomatonURLFilter.java    From anthelion with Apache License 2.0 4 votes vote down vote up
Rule(boolean sign, String regex) {
  super(sign, regex);
  automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
}
 
Example #7
Source File: ImhotepLocalSession.java    From imhotep with Apache License 2.0 4 votes vote down vote up
/**
 * Regroups documents according to whether a term of {@code field} is accepted by
 * {@code regex}: for every accepted term its documents are passed to
 * {@code remapPositiveDocs}, then {@code remapNegativeDocs} is applied using the
 * bitset filled along the way.
 *
 * @param field         string field whose terms are tested
 * @param regex         regular expression in dk.brics syntax
 * @param targetGroup   group the regroup applies to
 * @param negativeGroup group passed to the negative remapping step
 * @param positiveGroup group passed to the positive remapping step
 * @throws IllegalStateException        if the session has more than two groups
 * @throws ImhotepOutOfMemoryException  if the working bitset cannot be allocated
 */
@Override
public void regexRegroup(String field, String regex, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException {
    if (getNumGroups() > 2) {
        throw new IllegalStateException("regexRegroup should be applied as a filter when you have only one group");
    }
    if (targetGroup == 0) {
        clearZeroDocBitsets();
    }

    // Make sure the lookup can hold the larger of the two destination groups.
    final int highestGroup = Math.max(negativeGroup, positiveGroup);
    docIdToGroup = GroupLookupFactory.resize(docIdToGroup, highestGroup, memory);

    final FastBitSetPooler pooler = new ImhotepBitSetPooler(memory);
    final FastBitSet matchedDocs;
    try {
        matchedDocs = pooler.create(numDocs);
    } catch (FlamdexOutOfMemoryException e) {
        throw new ImhotepOutOfMemoryException(e);
    }

    try {
        try (
            final StringTermIterator termIterator = flamdexReader.getStringTermIterator(field);
            final DocIdStream docs = flamdexReader.getDocIdStream()
        ) {
            final Automaton matcher = new RegExp(regex).toAutomaton();

            while (termIterator.next()) {
                if (!matcher.run(termIterator.term())) {
                    continue;
                }
                docs.reset(termIterator);
                remapPositiveDocs(docs, matchedDocs, targetGroup, positiveGroup);
            }
        }
        remapNegativeDocs(matchedDocs, targetGroup, negativeGroup);
    } finally {
        // Hand the bitset's memory back to the pooler whether or not remapping succeeded.
        pooler.release(matchedDocs.memoryUsage());
    }

    finalizeRegroup();
}
 
Example #8
Source File: CompiledAutomaton.java    From spork with Apache License 2.0 4 votes vote down vote up
/**
 * Compiles {@code rhsPattern} with no optional dk.brics syntax enabled
 * ({@code RegExp.NONE}) and stores the resulting run automaton in {@code runauto}.
 *
 * @param rhsPattern the regular expression to compile
 */
public CompiledAutomaton( String rhsPattern ) {
    final RegExp parsed = new RegExp(rhsPattern, RegExp.NONE);
    this.runauto = new RunAutomaton(parsed.toAutomaton(), true);
}
 
Example #9
Source File: AutomatonURLFilter.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
Rule(boolean sign, String regex) {
  super(sign, regex);
  automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
}