/*******************************************************************************
 * Copyright 2014 A3 lab (Dipartimento di Informatica, Università di Pisa)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.acubelab.tagme.preprocessing.anchors;

import it.acubelab.Chars;
import it.acubelab.ExternalSortUtils;
import it.acubelab.PLogger;
import it.acubelab.PLogger.Step;
import it.acubelab.tagme.Anchor;
import it.acubelab.tagme.config.TagmeConfig;
import it.acubelab.tagme.config.Config.RepositoryDirs;
import it.acubelab.tagme.preprocessing.Dataset;
import it.acubelab.tagme.preprocessing.Indexer;
import it.acubelab.tagme.preprocessing.Indexes;
import it.acubelab.tagme.preprocessing.TextDataset;
import it.acubelab.tagme.preprocessing.WikipediaIndexer;
import it.acubelab.tagme.preprocessing.support.PeopleWIDs;
import it.unimi.dsi.fastutil.ints.Int2IntMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArraySet;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.lang.MutableString;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.BitSet;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class AnchorIndexer extends Indexer {

    Logger log = Logger.getLogger(AnchorIndexer.class);

    public static final String
        FIELD_ID = "id",
        FIELD_TEXT = "text",
        FIELD_ORIGINAL = "original",
        FIELD_OBJECT = "obj",
        FIELD_WID = "wid";

    public AnchorIndexer() {
    }

    // To speed up searches: on a server with enough memory we can load
    // the whole Wikipedia index into RAM.
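    /**
     * Heuristic threshold used by openWikipediaIndex(): the Wikipedia index is
     * copied into a RAMDirectory only if its on-disk size is smaller than
     * GAP_FACTOR times the maximum heap size; otherwise the disk-backed
     * searcher is used.
     */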
    private static double GAP_FACTOR = 0.8;

    private IndexSearcher openWikipediaIndex(String lang) throws IOException {
        File indexDir = RepositoryDirs.WIKIPEDIA.getDir(lang);
        long indexSize = FileUtils.sizeOfDirectory(indexDir);
        long maxMemory = Runtime.getRuntime().maxMemory();
        if (indexSize < maxMemory * GAP_FACTOR) {
            log.info("MaxMemory is enough, loading Wikipedia index...");
            IndexReader r = IndexReader.open(new RAMDirectory(FSDirectory.open(indexDir)), true);
            log.info("WikipediaIndex loaded.");
            return new IndexSearcher(r);
        } else {
            log.info("Not enough memory [" + maxMemory / 1000000 + "MB] to load WikipediaIndex (about " + indexSize / 1000000 + "MB)");
            return Indexes.getSearcher(RepositoryDirs.WIKIPEDIA.getPath(lang));
        }
    }

    @Override
    public void makeIndex(String lang, File workingDir) throws IOException {
        log.info("Loading support datasets...");

        File all_anchors = new WikipediaAnchorParser(lang).getFile();
        long numAnchors = ExternalSortUtils.wcl(all_anchors);
        AnchorIterator iterator = new AnchorIterator(all_anchors);

        IntSet people = new PeopleWIDs(lang).getDataset();

//        IndexSearcher articles = Indexes.getSearcher(RepositoryDirs.WIKIPEDIA.getPath(lang));
        IndexSearcher articles = openWikipediaIndex(lang);
//        QueryParser queryParser = new QueryParser(Version.LUCENE_34, WikipediaIndexer.FIELD_BODY, new WhitespaceAnalyzer(Version.LUCENE_34));
        QueryParser queryParser = new QueryParser(Version.LUCENE_34, WikipediaIndexer.FIELD_BODY, new StandardAnalyzer(Version.LUCENE_34, new HashSet<String>()));

        IndexWriter index = new IndexWriter(FSDirectory.open(workingDir.getAbsoluteFile()), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));

        // The same Document instance is reused for every anchor: the fixed fields are
        // added once, the per-anchor fields (FIELD_WID, FIELD_ORIGINAL) are added and
        // removed at each iteration.
        Document doc = new Document();
        Field fId = new Field(FIELD_ID, "", Store.YES, Index.NOT_ANALYZED);
        Field fText = new Field(FIELD_TEXT, "", Store.YES, Index.NOT_ANALYZED);
        Field fObject = new Field(FIELD_OBJECT, "", Store.YES, Index.NO);
        doc.add(fId);
        doc.add(fText);
        doc.add(fObject);

//        Field fOriginal = new Field(FIELD_ORIGINAL, "", Store.YES, Index.ANALYZED);
//        Field fWID = new Field(FIELD_WID, "", Store.NO, Index.ANALYZED);

        PLogger plog = new PLogger(log, Step.TEN_MINUTES, "lines", "anchors", "searches", "indexed", "0-freq", "dropped");
        plog.setEnd(0, numAnchors);
        plog.start("Support datasets loaded, now parsing...");
        int id = 0;
        while (iterator.next()) {
            plog.update(0, iterator.scroll);
            plog.update(1);
            String anchorText = iterator.anchor;

            // Number of Wikipedia articles whose body contains at least one of the
            // original (surface) forms of this anchor.
            int freq = freq(iterator.originals, articles, queryParser);
            plog.update(2, iterator.originals.size());
            if (freq == 0) plog.update(4);

            Anchor anchorObj = Anchor.build(id, iterator.links, freq, people);
            if (anchorObj == null) {
                plog.update(5);
                continue;
            }

            String anchorSerial = Anchor.serialize(anchorObj);
            fId.setValue(Integer.toString(++id));
            fText.setValue(anchorText);
            fObject.setValue(anchorSerial);

            for (int page : anchorObj) {
                Field fWID = new Field(FIELD_WID, Integer.toString(page), Store.YES, Index.NOT_ANALYZED);
//                fWID.setBoost(iterator.links.get(page));
                doc.add(fWID);
            }
            for (String original : iterator.originals) {
                doc.add(new Field(FIELD_ORIGINAL, original, Store.YES, Index.NOT_ANALYZED));
            }

            index.addDocument(doc);
            plog.update(3);

            doc.removeFields(FIELD_ORIGINAL);
            doc.removeFields(FIELD_WID);
        }
        plog.stop();
        iterator.close();

        log.info("Now optimizing...");
        index.optimize();

        index.close();
        log.info("Done.");
    }

    static final String QUERY_PATTERN = "\"%s\"";

    static int freq(Set<String> anchors, IndexSearcher index, QueryParser queryParser) throws IOException {
        //int sum = 0;
        // Collect, across all original forms, the set of distinct documents that contain
        // at least one of them: the anchor frequency is the cardinality of this set.
        BitSet bits = new BitSet(index.maxDoc());
        for (String a : anchors) {
            try {
                Query q = queryParser.parse(String.format(QUERY_PATTERN, QueryParser.escape(a)));
                TotalHitCountCollectorSet results = new TotalHitCountCollectorSet(bits);
                index.search(q, results);
                //sum += results.getTotalHits();
            } catch (ParseException e) {
                // Malformed queries are skipped: they do not contribute to the frequency.
            }
        }
        return bits.cardinality();
    }

    @Override
    public File getIndexDir(String lang) {
        return RepositoryDirs.ANCHORS.getDir(lang);
    }

    /**
     * A TotalHitCountCollector that also marks every matching document in a
     * shared BitSet, so that a document matched by more than one query is
     * counted only once.
     */
    private static class TotalHitCountCollectorSet extends TotalHitCountCollector {

        private BitSet set;
        private int docBase;

        public TotalHitCountCollectorSet(BitSet set) {
            super();
            this.set = set;
        }

        @Override
        public boolean acceptsDocsOutOfOrder() {
            return true;
        }

        @Override
        public void setNextReader(IndexReader reader, int docBase) {
            this.docBase = docBase;
        }

        @Override
        public void collect(int doc) {
            super.collect(doc);
            set.set(doc + docBase);
        }
    }

    /**
     * Iterates over the sorted anchor file, grouping consecutive lines that share
     * the same anchor text. For each group it exposes the anchor text, the set of
     * original (surface) forms and the map from target page to number of links.
     */
    private static class AnchorIterator {

        String anchor;
        String lastAnchor;
        Int2IntMap links;
        Set<String> originals;
        FastBufferedReader in;
        MutableString line;
        int scroll;
        boolean end;

        public AnchorIterator(File inputFile) throws IOException {
            anchor = null;
            links = new Int2IntOpenHashMap(1024);
            links.defaultReturnValue(0);
            originals = new HashSet<String>(32);
            in = new FastBufferedReader(new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")));
            line = new MutableString(1024);
            in.readLine(line);
            lastAnchor = Chars.split(line, TextDataset.SEP_CHAR)[0].toString();
            scroll = 1;
            end = false;
        }

        public boolean next() throws IOException {
            // Check whether we have already reached the end of the file.
            if (end) return false;

            links.clear();
            originals.clear();
            scroll = 0;
            while (true) {
                CharSequence[] fields = Chars.split(line, TextDataset.SEP_CHAR);
                if (fields[0].equals(lastAnchor)) {
                    // Add a new original (surface) form.
                    originals.add(fields[1].toString());
                    // Read the target page.
                    int targetpage = Chars.parseInt(fields[2]);
                    // Increase by one the number of links to the target page (note that
                    // if the key is not in the map, links.get(targetpage) returns 0).
                    links.put(targetpage, links.get(targetpage) + 1);
                } else {
                    // A new anchor starts here: expose the group collected so far.
                    anchor = lastAnchor;
                    lastAnchor = fields[0].toString();
                    break;
                }
                scroll++;
                if (in.readLine(line) == null) {
                    // End of file: expose the last group as well.
                    anchor = lastAnchor;
                    end = true;
                    break;
                }
            }
            return true;
        }

        void close() throws IOException {
            in.close();
        }
    }
    /**
     * End of AnchorIterator
     */
}
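/*
 * Usage sketch (an assumption, not part of the original code): as with the other
 * Indexer subclasses, the anchor index for a language is presumably built by
 * calling makeIndex() with a working directory, e.g.
 *
 *   AnchorIndexer indexer = new AnchorIndexer();
 *   indexer.makeIndex("en", new File("/path/to/anchor-index-workdir"));  // hypothetical path
 *
 * after which the finished index would be found under getIndexDir("en").
 */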