/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.luke.models.documents; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.OptionalInt; import java.util.stream.Collectors; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.luke.util.BytesRefUtils; /** * Holder for term vector entry representing the term and their number of occurrences, and optionally, positions in the document field. */ public final class TermVectorEntry { private final String termText; private final long freq; private final List<TermVectorPosition> positions; /** * Returns a new term vector entry representing the specified term, and optionally, positions. * * @param te - positioned terms iterator * @return term vector entry * @throws IOException - if there is a low level IO error. */ static TermVectorEntry of(TermsEnum te) throws IOException { Objects.requireNonNull(te); String termText = BytesRefUtils.decode(te.term()); List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>(); PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS); pe.nextDoc(); int freq = pe.freq(); for (int i = 0; i < freq; i++) { int pos = pe.nextPosition(); if (pos < 0) { // no position information available continue; } TermVectorPosition tvPos = TermVectorPosition.of(pos, pe); tvPositions.add(tvPos); } return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions); } private TermVectorEntry(String termText, long freq, List<TermVectorPosition> positions) { this.termText = termText; this.freq = freq; this.positions = positions; } /** * Returns the string representation for this term. */ public String getTermText() { return termText; } /** * Returns the number of occurrences of this term in the document field. */ public long getFreq() { return freq; } /** * Returns the list of positions for this term in the document field. */ public List<TermVectorPosition> getPositions() { return positions; } @Override public String toString() { String positionsStr = positions.stream() .map(TermVectorPosition::toString) .collect(Collectors.joining(",")); return "TermVectorEntry{" + "termText='" + termText + '\'' + ", freq=" + freq + ", positions=" + positionsStr + '}'; } /** * Holder for position information for a term vector entry. */ public static final class TermVectorPosition { private final int position; private final int startOffset; private final int endOffset; /** * Returns a new position entry representing the specified posting, and optionally, start and end offsets. * @param pos - term position * @param pe - positioned postings iterator * @return position entry * @throws IOException - if there is a low level IO error. */ static TermVectorPosition of(int pos, PostingsEnum pe) throws IOException { Objects.requireNonNull(pe); int sOffset = pe.startOffset(); int eOffset = pe.endOffset(); if (sOffset >= 0 && eOffset >= 0) { return new TermVectorPosition(pos, sOffset, eOffset); } return new TermVectorPosition(pos); } /** * Returns the position for this term in the document field. */ public int getPosition() { return position; } /** * Returns the start offset for this term in the document field. * Empty Optional instance is returned if no offset information available. */ public OptionalInt getStartOffset() { return startOffset >= 0 ? OptionalInt.of(startOffset) : OptionalInt.empty(); } /** * Returns the end offset for this term in the document field. * Empty Optional instance is returned if no offset information available. */ public OptionalInt getEndOffset() { return endOffset >= 0 ? OptionalInt.of(endOffset) : OptionalInt.empty(); } @Override public String toString() { return "TermVectorPosition{" + "position=" + position + ", startOffset=" + startOffset + ", endOffset=" + endOffset + '}'; } private TermVectorPosition(int position) { this(position, -1, -1); } private TermVectorPosition(int position, int startOffset, int endOffset) { this.position = position; this.startOffset = startOffset; this.endOffset = endOffset; } } }