package mtas.search.spans; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.spans.SpanCollector; import org.apache.lucene.search.spans.Spans; import mtas.search.spans.util.MtasIgnoreItem; import mtas.search.spans.util.MtasSpans; /** * The Class MtasSpanRecurrenceSpans. */ public class MtasSpanRecurrenceSpans extends MtasSpans { /** The log. */ private static Log log = LogFactory.getLog(MtasSpanRecurrenceSpans.class); /** The query. */ private MtasSpanRecurrenceQuery query; /** The spans. */ private Spans spans; /** The ignore item. */ private MtasIgnoreItem ignoreItem; /** The minimum recurrence. */ int minimumRecurrence; /** The maximum recurrence. */ int maximumRecurrence; /** The queue spans. */ List<Match> queueSpans; /** The queue matches. */ List<Match> queueMatches; /** The current match. */ Match currentMatch; /** The no more positions. */ boolean noMorePositions; /** The last start position. */ int lastStartPosition; // startPosition of last retrieved span /** The last span. */ boolean lastSpan; // last span for this document added to queue /** * Instantiates a new mtas span recurrence spans. * * @param query the query * @param spans the spans * @param minimumRecurrence the minimum recurrence * @param maximumRecurrence the maximum recurrence * @param ignoreSpans the ignore spans * @param maximumIgnoreLength the maximum ignore length */ public MtasSpanRecurrenceSpans(MtasSpanRecurrenceQuery query, Spans spans, int minimumRecurrence, int maximumRecurrence, Spans ignoreSpans, Integer maximumIgnoreLength) { assert minimumRecurrence <= maximumRecurrence : "minimumRecurrence > maximumRecurrence"; assert minimumRecurrence > 0 : "minimumRecurrence < 1 not supported"; this.query = query; this.spans = spans; this.minimumRecurrence = minimumRecurrence; this.maximumRecurrence = maximumRecurrence; queueSpans = new ArrayList<>(); queueMatches = new ArrayList<>(); ignoreItem = new MtasIgnoreItem(ignoreSpans, maximumIgnoreLength); resetQueue(); } /* * (non-Javadoc) * * @see org.apache.lucene.search.spans.Spans#nextStartPosition() */ @Override public int nextStartPosition() throws IOException { if (findMatches()) { currentMatch = queueMatches.get(0); queueMatches.remove(0); noMorePositions = false; return currentMatch.startPosition(); } else { currentMatch = null; noMorePositions = true; return NO_MORE_POSITIONS; } } /* * (non-Javadoc) * * @see org.apache.lucene.search.spans.Spans#startPosition() */ @Override public int startPosition() { if (currentMatch == null) { if (noMorePositions) { return NO_MORE_POSITIONS; } else { return -1; } } else { return currentMatch.startPosition(); } } /* * (non-Javadoc) * * @see org.apache.lucene.search.spans.Spans#endPosition() */ @Override public int endPosition() { if (currentMatch == null) { if (noMorePositions) { return NO_MORE_POSITIONS; } else { return -1; } } else { return currentMatch.endPosition(); } } /* * (non-Javadoc) * * @see org.apache.lucene.search.spans.Spans#width() */ @Override public int width() { return 1; } /* * (non-Javadoc) * * @see * org.apache.lucene.search.spans.Spans#collect(org.apache.lucene.search.spans * .SpanCollector) */ @Override public void collect(SpanCollector collector) throws IOException { spans.collect(collector); } /* * (non-Javadoc) * * @see org.apache.lucene.search.DocIdSetIterator#docID() */ @Override public int docID() { return spans.docID(); } /* * (non-Javadoc) * * @see org.apache.lucene.search.DocIdSetIterator#nextDoc() */ @Override public int nextDoc() throws IOException { resetQueue(); return (spans.nextDoc() == NO_MORE_DOCS) ? NO_MORE_DOCS : toMatchDoc(); } /* * (non-Javadoc) * * @see org.apache.lucene.search.DocIdSetIterator#advance(int) */ @Override public int advance(int target) throws IOException { resetQueue(); return (spans.advance(target) == NO_MORE_DOCS) ? NO_MORE_DOCS : toMatchDoc(); } /** * Reset queue. */ void resetQueue() { queueSpans.clear(); queueMatches.clear(); lastStartPosition = 0; lastSpan = false; currentMatch = null; noMorePositions = false; } /** * To match doc. * * @return the int * @throws IOException Signals that an I/O exception has occurred. */ int toMatchDoc() throws IOException { while (true) { if (findMatches()) { return docID(); } resetQueue(); if (spans.nextDoc() == NO_MORE_DOCS) { return NO_MORE_DOCS; } } } /** * Collect span. * * @return true, if successful * @throws IOException Signals that an I/O exception has occurred. */ // try to get something in the queue of spans private boolean collectSpan() throws IOException { if (lastSpan) { return false; } else if (spans.nextStartPosition() == NO_MORE_POSITIONS) { lastSpan = true; return false; } else { queueSpans.add(new Match(spans.startPosition(), spans.endPosition())); lastStartPosition = spans.startPosition(); return true; } } /** * Find matches. * * @return true, if successful * @throws IOException Signals that an I/O exception has occurred. */ private boolean findMatches() throws IOException { // check for something in queue of matches if (!queueMatches.isEmpty()) { return true; } else { ignoreItem.advanceToDoc(spans.docID()); while (true) { // try to get something in queue of spans if (queueSpans.isEmpty() && !collectSpan()) { return false; } // try to get matches with first span in queue Match firstMatch = queueSpans.remove(0); // create a list of matches with same startPosition as firstMatch List<Match> matches = new ArrayList<>(); matches.add(firstMatch); // matches.addAll(expandWithIgnoreItem(spans.docID(), firstMatch)); // try to collect spans until lastStartPosition not equal to // startPosition of firstMatch while (!lastSpan && (lastStartPosition == firstMatch.startPosition())) { collectSpan(); } while (!queueSpans.isEmpty() && (queueSpans.get(0) .startPosition() == firstMatch.startPosition())) { Match additionalMatch = queueSpans.remove(0); matches.add(additionalMatch); matches.addAll(expandWithIgnoreItem(spans.docID(), additionalMatch)); } // construct all matches for this startPosition for (Match match : matches) { for (int n = (minimumRecurrence - 1); n <= (maximumRecurrence - 1); n++) { findMatches(match, n); } } // check for something in queue of matches if (!queueMatches.isEmpty()) { ignoreItem.removeBefore(spans.docID(), queueMatches.get(0).startPosition()); return true; } } } } /** * Find matches. * * @param match the match * @param n the n * @throws IOException Signals that an I/O exception has occurred. */ private void findMatches(Match match, int n) throws IOException { if (n > 0) { int largestMatchingEndPosition = match.endPosition(); Set<Integer> list = ignoreItem.getFullEndPositionList(spans.docID(), match.endPosition()); // try to find matches with existing queue if (!queueSpans.isEmpty()) { Match span; for (int i = 0; i < queueSpans.size(); i++) { span = queueSpans.get(i); if (match.endPosition() == span.startPosition() || (list != null && list.contains(span.startPosition()))) { findMatches(new Match(match.startPosition(), span.endPosition()), (n - 1)); largestMatchingEndPosition = Math.max(largestMatchingEndPosition, span.endPosition()); } } } // extend queue if necessary and possible while (!lastSpan && (largestMatchingEndPosition >= lastStartPosition)) { if (spans.nextStartPosition() == NO_MORE_POSITIONS) { lastSpan = true; } else { Match span = new Match(spans.startPosition(), spans.endPosition()); queueSpans.add(span); lastStartPosition = spans.startPosition(); // check if this provides new match if (match.endPosition() == span.startPosition() || (list != null && list.contains(span.startPosition()))) { findMatches(new Match(match.startPosition(), span.endPosition()), (n - 1)); largestMatchingEndPosition = Math.max(largestMatchingEndPosition, span.endPosition()); } } } } else { // only unique spans if (!queueMatches.contains(match)) { queueMatches.add(match); } } } /** * Expand with ignore item. * * @param docId the doc id * @param match the match * @return the list */ private List<Match> expandWithIgnoreItem(int docId, Match match) { List<Match> list = new ArrayList<>(); try { Set<Integer> ignoreList = ignoreItem.getFullEndPositionList(docId, match.endPosition); if (ignoreList != null) { for (Integer endPosition : ignoreList) { list.add(new Match(match.startPosition, endPosition)); } } } catch (IOException e) { log.debug(e); } return list; } /** * The Class Match. */ private static class Match { /** The start position. */ private int startPosition; /** The end position. */ private int endPosition; /** * Instantiates a new match. * * @param startPosition the start position * @param endPosition the end position */ Match(int startPosition, int endPosition) { this.startPosition = startPosition; this.endPosition = endPosition; } /** * Start position. * * @return the int */ public int startPosition() { return startPosition; } /** * End position. * * @return the int */ public int endPosition() { return endPosition; } /* * (non-Javadoc) * * @see java.lang.Object#equals(java.lang.Object) */ @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; final Match that = (Match) obj; return startPosition == that.startPosition && endPosition == that.endPosition; } /* * (non-Javadoc) * * @see java.lang.Object#hashCode() */ @Override public int hashCode() { int h = this.getClass().getSimpleName().hashCode(); h = (h * 5) ^ startPosition; h = (h * 7) ^ endPosition; return h; } } /* * (non-Javadoc) * * @see org.apache.lucene.search.DocIdSetIterator#cost() */ @Override public long cost() { return (spans == null) ? 0 : spans.cost(); } /* * (non-Javadoc) * * @see org.apache.lucene.search.spans.Spans#positionsCost() */ @Override public float positionsCost() { return 0; } /* * (non-Javadoc) * * @see mtas.search.spans.util.MtasSpans#asTwoPhaseIterator() */ @Override public TwoPhaseIterator asTwoPhaseIterator() { if (spans == null || !query.twoPhaseIteratorAllowed()) { return null; } else { // TODO return null; } } }