/** * Palmetto - Palmetto is a quality measure tool for topics. * Copyright © 2014 Data Science Group (DICE) ([email protected]) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.aksw.palmetto.corpus.lucene.creation; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.Iterator; import org.aksw.palmetto.Palmetto; import org.aksw.palmetto.corpus.lucene.WindowSupportingLuceneCorpusAdapter; import org.aksw.palmetto.data.CountedSubsets; import org.aksw.palmetto.data.SegmentationDefinition; import org.aksw.palmetto.prob.window.BooleanSlidingWindowFrequencyDeterminer; import org.apache.commons.io.FileUtils; import org.apache.lucene.index.CorruptIndexException; import org.junit.Assert; import org.junit.Test; import com.carrotsearch.hppc.IntArrayList; import com.carrotsearch.hppc.IntIntOpenHashMap; import com.carrotsearch.hppc.IntObjectOpenHashMap; public class PositionStoringLuceneIndexCreatorTest { public static IndexableDocument DOCUMENTS[] = { new IndexableDocument("This is a test document.", 5), new IndexableDocument("This is another test document.", 5), new IndexableDocument("This is the third.", 4) }; public static int WINDOW_SIZE = 4; public static String TEST_WORDS[] = { "is", "document", "dog" }; // expected positions of the words inside the documents (negativ number // means that the word is not inside the // document public static int EXPECTED_WORD_POSITIONS[][] = new int[][] { { 1, 4, -1 }, { 1, 4, -1 }, { 1, -1, -1 } }; // counts for windows of size 4 containing { ALWAYS 0, "is", "document", // "is"+"document", "dog", "is"+"dog", "document"+"dog", // "is"+"document"+"dog"} public static int EXPECTED_COUNTS[] = new int[] { 0, 5, 2, 2, 0, 0, 0, 0 }; @Test public void test() throws CorruptIndexException, IOException { File indexDir = new File( FileUtils.getTempDirectoryPath() + File.separator + "temp_index" + Long.toString(System.nanoTime())); Assert.assertTrue(indexDir.mkdir()); Iterator<IndexableDocument> docIterator = Arrays.asList(DOCUMENTS).iterator(); // create the index PositionStoringLuceneIndexCreator creator = new PositionStoringLuceneIndexCreator( Palmetto.DEFAULT_TEXT_INDEX_FIELD_NAME, Palmetto.DEFAULT_DOCUMENT_LENGTH_INDEX_FIELD_NAME); Assert.assertTrue(creator.createIndex(indexDir, docIterator)); LuceneIndexHistogramCreator hCreator = new LuceneIndexHistogramCreator( Palmetto.DEFAULT_DOCUMENT_LENGTH_INDEX_FIELD_NAME); hCreator.createLuceneIndexHistogram(indexDir.getAbsolutePath()); // test the created index // create an adapter WindowSupportingLuceneCorpusAdapter adapter = null; try { adapter = WindowSupportingLuceneCorpusAdapter.create(indexDir.getAbsolutePath(), Palmetto.DEFAULT_TEXT_INDEX_FIELD_NAME, Palmetto.DEFAULT_DOCUMENT_LENGTH_INDEX_FIELD_NAME); // query the test words IntIntOpenHashMap docLengths = new IntIntOpenHashMap(); IntObjectOpenHashMap<IntArrayList[]> wordPositions = adapter.requestWordPositionsInDocuments(TEST_WORDS, docLengths); // compare the result with the expected counts int positionInDoc; IntArrayList[] positionsInDocs; for (int i = 0; i < EXPECTED_WORD_POSITIONS.length; ++i) { positionsInDocs = wordPositions.get(i); for (int j = 0; j < positionsInDocs.length; ++j) { if (EXPECTED_WORD_POSITIONS[i][j] < 0) { Assert.assertNull("Expected null because the word \"" + TEST_WORDS[j] + "\" shouldn't be found inside document " + i + ". But got a position list instead.", positionsInDocs[j]); } else { Assert.assertEquals(1, positionsInDocs[j].elementsCount); positionInDoc = positionsInDocs[j].buffer[0]; Assert.assertEquals("Expected the word \"" + TEST_WORDS[j] + "\" in document " + i + " at position " + EXPECTED_WORD_POSITIONS[i][j] + " but got position " + positionInDoc + " form the index.", EXPECTED_WORD_POSITIONS[i][j], positionInDoc); } } } // test the window based counting BooleanSlidingWindowFrequencyDeterminer determiner = new BooleanSlidingWindowFrequencyDeterminer(adapter, WINDOW_SIZE); CountedSubsets subsets = determiner.determineCounts(new String[][] { TEST_WORDS }, new SegmentationDefinition[] { new SegmentationDefinition(new int[0], new int[0][0], null) })[0]; Assert.assertArrayEquals(EXPECTED_COUNTS, subsets.counts); } finally { if (adapter != null) { adapter.close(); } } } }