/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.shingle; import java.io.IOException; import java.util.Iterator; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings; public class FixedShingleFilterTest extends BaseTokenStreamTestCase { public void testBiGramFilter() throws IOException { TokenStream ts = new CannedTokenStream( new Token("please", 0, 6), new Token("divide", 7, 13), new Token("this", 14, 18), new Token("sentence", 19, 27), new Token("into", 28, 32), new Token("shingles", 33, 41) ); assertTokenStreamContents(new FixedShingleFilter(ts, 2), new String[]{"please divide", "divide this", "this sentence", "sentence into", "into shingles"}, new int[]{0, 7, 14, 19, 28,}, new int[]{13, 18, 27, 32, 41,}, new String[]{"shingle", "shingle", "shingle", "shingle", "shingle",}, new int[]{1, 1, 1, 1, 1,}, new int[]{1, 1, 1, 1, 1}); } public void testBiGramFilterWithAltSeparator() throws IOException { TokenStream ts = new CannedTokenStream( new Token("please", 0, 6), new Token("divide", 7, 13), new Token("this", 14, 18), new Token("sentence", 19, 27), new Token("into", 28, 32), new Token("shingles", 33, 41) ); assertTokenStreamContents(new FixedShingleFilter(ts, 2, "<SEP>", "_"), new String[]{"please<SEP>divide", "divide<SEP>this", "this<SEP>sentence", "sentence<SEP>into", "into<SEP>shingles"}, new int[]{0, 7, 14, 19, 28}, new int[]{13, 18, 27, 32, 41}, new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"}, new int[]{1, 1, 1, 1, 1}); } public void testTriGramFilter() throws IOException { TokenStream ts = new CannedTokenStream( new Token("please", 0, 6), new Token("divide", 7, 13), new Token("this", 14, 18), new Token("sentence", 19, 27), new Token("into", 28, 32), new Token("shingles", 33, 41) ); assertTokenStreamContents(new FixedShingleFilter(ts, 3), new String[]{"please divide this", "divide this sentence", "this sentence into", "sentence into shingles"}); } public void testShingleSizeGreaterThanTokenstreamLength() throws IOException { TokenStream ts = new FixedShingleFilter(new CannedTokenStream( new Token("please", 0, 6), new Token("divide", 7, 13) ), 3); ts.reset(); assertFalse(ts.incrementToken()); } public void testWithStopwords() throws IOException { TokenStream ts = new CannedTokenStream( new Token("please", 0, 6), new Token("divide", 7, 13), new Token("sentence", 2, 19, 27), new Token("shingles", 2, 33, 41) ); assertTokenStreamContents(new FixedShingleFilter(ts, 3), new String[]{"please divide _", "divide _ sentence", "sentence _ shingles"}, new int[]{0, 7, 19,}, new int[]{13, 27, 41,}, new String[]{"shingle", "shingle", "shingle",}, new int[]{1, 1, 2,}); } public void testConsecutiveStopwords() throws IOException { TokenStream ts = new CannedTokenStream( new Token("b", 2, 2, 3), new Token("c", 4, 5), new Token("d", 6, 7), new Token("b", 3, 12, 13), new Token("c", 14, 15) ); assertTokenStreamContents(new FixedShingleFilter(ts, 4), new String[]{"b c d _", "c d _ _", "d _ _ b"}, new int[]{2, 4, 6,}, new int[]{7, 7, 13,}, new int[]{2, 1, 1,}); } public void testTrailingStopwords() throws IOException { TokenStream ts = new CannedTokenStream(1, 7, new Token("b", 0, 1), new Token("c", 2, 3), new Token("d", 4, 5) ); assertTokenStreamContents(new FixedShingleFilter(ts, 3), new String[] { "b c d", "c d _" }, new int[] { 0, 2, }, new int[] { 5, 5, }, new int[] { 1, 1, }); } public void testMultipleTrailingStopwords() throws IOException { TokenStream ts = new CannedTokenStream(2, 9, new Token("b", 0, 1), new Token("c", 2, 3), new Token("d", 4, 5) ); assertTokenStreamContents(new FixedShingleFilter(ts, 3), new String[] { "b c d", "c d _", "d _ _" }, new int[] { 0, 2, 4 }, new int[] { 5, 5, 5 }, new int[] { 1, 1, 1 }); } public void testIncomingGraphs() throws IOException { // b/a c b/a d TokenStream ts = new CannedTokenStream( new Token("b", 0, 1), new Token("a", 0, 0, 1), new Token("c", 2, 3), new Token("b", 4, 5), new Token("a", 0, 4, 5), new Token("d", 6, 7) ); assertTokenStreamContents(new FixedShingleFilter(ts, 2), new String[] { "b c", "a c", "c b", "c a", "b d", "a d" }, new int[] { 0, 0, 2, 2, 4, 4 }, new int[] { 3, 3, 5, 5, 7, 7 }, new int[] { 1, 0, 1, 0, 1, 0 }); } public void testShinglesSpanningGraphs() throws IOException { TokenStream ts = new CannedTokenStream( new Token("b", 0, 1), new Token("a", 0, 0, 1), new Token("c", 2, 3), new Token("b", 4, 5), new Token("a", 0, 4, 5), new Token("d", 6, 7) ); assertTokenStreamContents(new FixedShingleFilter(ts, 3), new String[] { "b c b", "b c a", "a c b", "a c a", "c b d", "c a d" }, new int[] { 0, 0, 0, 0, 2, 2, }, new int[] { 5, 5, 5, 5, 7, 7, }, new int[] { 1, 0, 0, 0, 1, 0, }); } public void testTrailingGraphsOfDifferingLengths() throws IOException { // a b:3/c d e f TokenStream ts = new CannedTokenStream( new Token("a", 0, 1), new Token("b", 1, 2, 3, 3), new Token("c", 0, 2, 3), new Token("d", 2, 3), new Token("e", 2, 3), new Token("f", 4, 5) ); assertTokenStreamContents(new FixedShingleFilter(ts, 3), new String[]{ "a b f", "a c d", "c d e", "d e f"}); } public void testParameterLimits() { IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> { new FixedShingleFilter(new CannedTokenStream(), 1); }); assertEquals("Shingle size must be between 2 and 4, got 1", e.getMessage()); IllegalArgumentException e2 = expectThrows(IllegalArgumentException.class, () -> { new FixedShingleFilter(new CannedTokenStream(), 5); }); assertEquals("Shingle size must be between 2 and 4, got 5", e2.getMessage()); } public void testWithGraphInput() throws IOException { TokenStream ts = new CannedTokenStream( new Token("fuz", 0, 3), new Token("foo", 1, 4, 6, 2), new Token("bar", 0, 4, 6), new Token("baz", 1, 4, 6) ); GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts); Iterator<TokenStream> it = graph.getFiniteStrings(); assertTokenStreamContents(new FixedShingleFilter(it.next(), 2), new String[]{ "fuz foo"}); assertTokenStreamContents(new FixedShingleFilter(it.next(), 2), new String[]{ "fuz bar", "bar baz"}); } }