/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search.uhighlight; import java.io.IOException; import java.text.BreakIterator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.search.Query; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.QueryBuilder; public class LengthGoalBreakIteratorTest extends LuceneTestCase { private static final String FIELD = "body"; private static final float[] ALIGNS = {0.f, 0.5f, 1.f}; // We test LengthGoalBreakIterator as it is used by the UnifiedHighlighter instead of directly, because it is // not a general purpose BreakIterator. A unit test of it directly wouldn't give as much confidence. private final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);//whitespace, punctuation, lowercase // We do a '.' BreakIterator and test varying the length goal. // 0 1 // 01234567890123456789 static final String CONTENT = "Aa bb. Cc dd. Ee ff"; static final String CONTENT2 = "Aa bb Cc dd X Ee ff Gg hh."; static final String CONTENT3 = "Aa bbcc ddxyzee ffgg hh."; public void testFragmentAlignmentConstructor() throws IOException { BreakIterator baseBI = new CustomSeparatorBreakIterator('.'); // test fragmentAlignment validation float[] valid_aligns = {0.f, 0.3333f, 0.5f, 0.99f, 1.f}; for (float alignment : valid_aligns) { LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment); } float[] invalid_aligns = {-0.01f, -1.f, 1.5f, Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY}; for (float alignment : invalid_aligns) { expectThrows(IllegalArgumentException.class, () -> { LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment); }); } // test backwards compatibility constructors String backwardCompString = LengthGoalBreakIterator.createClosestToLength(baseBI, 50).toString(); assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0")); backwardCompString = LengthGoalBreakIterator.createMinLength(baseBI, 50).toString(); assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0")); } public void testTargetLen() throws IOException { // "goal" means target length goal to find closest break // at first word: Query query = query("aa"); assertEquals("almost two sent A", "<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 7, 0.f)); assertEquals("almost two sent B", "<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 15, 0.5f)); assertEquals("almost two sent C", "<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 64, 1.f)); assertEquals("barely two sent A", "<b>Aa</b> bb. Cc dd.", highlightClosestToLen(CONTENT, query, 8, 0.f)); assertEquals("barely two sent B", "<b>Aa</b> bb. Cc dd.", highlightClosestToLen(CONTENT, query, 16, 0.5f)); assertEquals("long goal A", "<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 14 + random().nextInt(20), 0.f)); assertEquals("long goal B", "<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 28 + random().nextInt(20), 0.5f)); // at some word not at start of passage query = query("dd"); for (float align : ALIGNS) { // alignment is not meaningful if fragsize is shorter than or closer to match-fragment boundaries assertEquals("short goal " + align, " Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, random().nextInt(4), align)); } // preceding/following inclusion by alignment parameter assertEquals("barely two sent A", " Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.f)); assertEquals("barely two sent B", " Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.5f)); assertEquals("barely two sent C", "Aa bb. Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, 11, 1.f)); assertEquals("long goal A", " Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.f)); assertEquals("long goal B", "Aa bb. Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.5f)); assertEquals("long goal C", "Aa bb. Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 1.f)); query = query("ddxyzee"); assertEquals("test fragment search from the middle of the match; almost including", "<b>ddxyzee</b> ", highlightClosestToLen(CONTENT3, query, 7, 0.5f, 1, ' ')); assertEquals("test fragment search from the middle of the match; barely including", "bbcc <b>ddxyzee</b> ffgg ", highlightClosestToLen(CONTENT3, query, 14, 0.5f, 1, ' ')); } public void testMinLen() throws IOException { // minLen mode is simpler than targetLen... just test a few cases Query query = query("dd"); assertEquals("almost two sent A", " Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 0, 0.f)); assertEquals("almost two sent B", " Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 1, 0.5f)); assertEquals("almost two sent C", " Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 5, 1.f)); assertEquals("barely two sent A", " Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 1, 0.f)); assertEquals("barely two sent B", " Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 2, 0.5f)); assertEquals("barely two sent C", "Aa bb. Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 7, 1.f)); assertEquals("barely two sent D/a", " Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 2, 0.55f)); assertEquals("barely two sent D/b", " Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 3, 0.55f)); assertEquals("barely two sent E/a", " Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 10, 0.5f)); assertEquals("barely two sent E/b", "Aa bb. Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 10, 0.7f)); assertEquals("barely two sent E/c", "Aa bb. Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 9, 0.9f)); query = query("ddxyzee"); assertEquals("test fragment search from the middle of the match; almost including", "<b>ddxyzee</b> ", highlightMinLen(CONTENT3, query, 7, 0.5f, ' ')); assertEquals("test fragment search from the middle of the match; barely including", "bbcc <b>ddxyzee</b> ffgg ", highlightMinLen(CONTENT3, query, 8, 0.5f, ' ')); } public void testMinLenPrecision() throws IOException { Query query = query("x"); assertEquals("test absolute minimal length", "<b>X</b> ", highlightMinLen(CONTENT2, query, 1, 0.5f, ' ')); assertEquals("test slightly above minimal length", "dd <b>X</b> Ee ", highlightMinLen(CONTENT2, query, 4, 0.5f, ' ')); } public void testDefaultSummaryTargetLen() throws IOException { Query query = query("zz"); for (float align : ALIGNS) { // alignment is not used for creating default-summary assertEquals("Aa bb.", highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), align)); assertEquals("Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 12 + random().nextInt(4), align)); assertEquals("Aa bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), align)); } assertEquals("Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), 0.f, 2)); } private Query query(String qStr) { return new QueryBuilder(analyzer).createBooleanQuery(FIELD, qStr); } private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException { return highlightClosestToLen(content, query, lengthGoal, fragAlign, 1); } private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages) throws IOException { return highlightClosestToLen(content, query, lengthGoal, fragAlign, maxPassages, '.'); } private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages, char separator) throws IOException { UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer); highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign)); return highlighter.highlightWithoutSearcher(FIELD, query, content, maxPassages).toString(); } private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException { return highlightMinLen(content, query, lengthGoal, fragAlign, '.'); } private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign, char separator) throws IOException { // differs from above only by "createMinLength" UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer); highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign)); return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString(); } }